2255 lines
81 KiB
C++
2255 lines
81 KiB
C++
/***************************************************************************
|
|
*
|
|
* collate.cpp
|
|
*
|
|
* $Id: collate.cpp 648752 2008-04-16 17:01:56Z faridz $
|
|
*
|
|
***************************************************************************
|
|
*
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
* contributor license agreements. See the NOTICE file distributed
|
|
* with this work for additional information regarding copyright
|
|
* ownership. The ASF licenses this file to you under the Apache
|
|
* License, Version 2.0 (the "License"); you may not use this file
|
|
* except in compliance with the License. You may obtain a copy of
|
|
* the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
* implied. See the License for the specific language governing
|
|
* permissions and limitations under the License.
|
|
*
|
|
* Copyright 2001-2008 Rogue Wave Software, Inc.
|
|
*
|
|
**************************************************************************/
|
|
|
|
#include "def.h" // for Def
|
|
#include "diagnostic.h" // for issue_diag()
|
|
#include "path.h" // for get_pathname()
|
|
#include "scanner.h" // for scanner
|
|
|
|
#include <fstream> // for ofstream
|
|
#include <iostream> // for cout
|
|
#include <string> // for string
|
|
#include <vector> // for vector
|
|
|
|
#include <cassert> // for assert()
|
|
#include <cctype> // for toupper()
|
|
#include <cstdio> // for sprintf()
|
|
#include <cstdlib> // for strtoul()
|
|
#include <cstring> // for strchr()
|
|
|
|
|
|
void Def::
|
|
create_wchar_utf8_table ()
|
|
{
|
|
if (!wchar_utf8_to_ext_.empty())
|
|
return;
|
|
|
|
typedef std::map<std::string, wchar_t>::const_iterator n_cmap_citer2;
|
|
|
|
const n_cmap_citer2 n_cmap_end = charmap_.get_mb_cmap ().end ();
|
|
|
|
for (n_cmap_citer2 n_cmap_it = charmap_.get_mb_cmap ().begin ();
|
|
n_cmap_it != n_cmap_end; ++n_cmap_it) {
|
|
|
|
const std::string wchar_utf8 = utf8_encode (n_cmap_it->second);
|
|
|
|
wchar_utf8_to_ext_.insert (std::make_pair (wchar_utf8,
|
|
n_cmap_it->first));
|
|
}
|
|
}
|
|
|
|
|
|
// Builds the array of weights for one collation entry and attaches it
// to the entry's record in coll_map_ (when the symbol names a character
// in the wide character map) or in ce_map_ (when it names a
// collating-element).
//
// entry  the collation definition: entry.first is the symbol token at
//        the start of the line, entry.second the list of weight tokens
//
// The weight tokens present on the line are handed to get_weight(), one
// per level.  Every level left unspecified gets a single weight equal
// to the collation value of the symbol itself.  When the symbol is in
// neither map a W_COLSYM warning is issued and the array is released.
void Def::process_weights (collate_entry_t& entry)
{
    // iterator of weights tokens
    token_list_t::iterator w_it = entry.second.begin ();

    w_cmap_iter   wcmap_it;
    ce_map_iter   ce_map_it;
    coll_map_iter coll_map_it;

    Weights_t* weights = new Weights_t[collate_out_.num_weights];

    std::size_t i = 0;
    for (i = 0; i < collate_out_.num_weights && w_it != entry.second.end ();
         ++i, ++w_it) {
        get_weight (*w_it, weights, i);
    }

    const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();

    // the remaining weights - see above - are given as weight the
    // collating value of the symbol at start of line
    while (i < collate_out_.num_weights) {
        weights[i].size = 1;
        wcmap_it = w_cmap.find (entry.first.name);
        if (wcmap_it != w_cmap.end()) {
            coll_map_it = coll_map_.find (wcmap_it->second);

            // do not dereference end() if the character has no record
            if (coll_map_it != coll_map_.end ())
                weights[i].weight[0] = coll_map_it->second.coll_val;
        }
        else if ((ce_map_it = ce_map_.find (entry.first.name)) !=
                 ce_map_.end()) {
            weights[i].weight[0] = ce_map_it->second.coll_val;
        }
        else {
            warnings_occurred_ =
                issue_diag (W_COLSYM, false, &entry.first,
                            "symbolic name %s not defined as character or "
                            "collating-element, ignoring\n",
                            entry.first.name.c_str()) || warnings_occurred_;
        }
        i++;
    }

    // hand the array over to the record that owns the symbol
    wcmap_it = w_cmap.find (entry.first.name);
    if (wcmap_it != w_cmap.end()) {
        coll_map_it = coll_map_.find (wcmap_it->second);
        if (coll_map_it != coll_map_.end ())
            coll_map_it->second.weights = weights;
        else
            delete[] weights;   // no record to attach the weights to
    } else if ((ce_map_it = ce_map_.find (entry.first.name)) != ce_map_.end()) {
        ce_map_it->second.weights = weights;
    } else {
        // nobody takes ownership of the array: release it
        delete[] weights;

        warnings_occurred_ =
            issue_diag (W_COLSYM, false, &entry.first,
                        "symbolic name %s not defined as character or "
                        "collating-element, ignoring\n",
                        entry.first.name.c_str()) || warnings_occurred_;
    }
}
|
|
|
|
|
|
// Translates the collation orders of a section into weight types.
//
// section  the collation section whose order list is examined
//
// Sets collate_out_.num_weights to the number of levels (1 when the
// section lists no order at all) and collate_out_.weight_type[] to one
// code per level:
//     0  forward
//     1  backward
//     2  forward,position  (also plain "position")
//     3  backward,position
// With no_position_ set, "position" and "forward,position" are stored
// as plain forward (0).
// NOTE(review): "backward,position" is stored as 3 regardless of
// no_position_ -- confirm that this asymmetry is intended.
//
// Returns the weight types packed two bits per level; each new level is
// shifted in at the low end, so the first level ends up highest.
unsigned int Def::process_order_stmt (collate_section_t& section)
{
    // number of orders in the section
    collate_out_.num_weights = section.order.size ();
    if (collate_out_.num_weights == 0) {
        collate_out_.num_weights = 1;
        collate_out_.weight_type[0] = 0;
    } else {
        collate_out_.num_weights = 0;
        token_list_t::iterator ord_it = section.order.begin ();
        for (; ord_it != section.order.end ();
             ++ord_it, ++collate_out_.num_weights) {
            if (ord_it->token == Scanner::tok_forward) {
                collate_out_.weight_type[collate_out_.num_weights] = 0;
            } else if (ord_it->token == Scanner::tok_backward) {
                collate_out_.weight_type[collate_out_.num_weights] = 1;
            } else if (ord_it->token == Scanner::tok_position) {
                // this is equivalent to "forward,position"
                collate_out_.weight_type[collate_out_.num_weights] =
                    no_position_ ? 0 : 2;
            } else {
                // expect a compound order such as "forward,position"
                const std::string::size_type comma_pos =
                    ord_it->name.find (',');

                if (comma_pos != std::string::npos) {
                    const std::string first =
                        ord_it->name.substr (0, comma_pos);
                    const std::string second =
                        ord_it->name.substr (comma_pos + 1);

                    const bool is_forward  = first == "forward";
                    const bool is_backward = first == "backward";

                    // verify a valid weight ordering: the direction must
                    // be known and may only be combined with "position"
                    if (!(is_forward || is_backward)
                        || second != "position") {
                        issue_diag (E_COLORD, true,
                                    &*ord_it, "unrecognized collating order\n");
                        break;
                    } else if (is_forward) {
                        collate_out_.weight_type[collate_out_.num_weights] =
                            no_position_ ? 0 : 2;
                    }
                    else
                        collate_out_.weight_type[collate_out_.num_weights] = 3;
                }
                else {
                    issue_diag (E_COLORD, true,
                                &*ord_it, "unrecognized collating order\n");
                }
            }
        }
    }

    // build the order value; two bits are used for each level
    unsigned int order = 0;
    for (int i = 0; i < collate_out_.num_weights; ++i) {
        order <<= 2;
        order |= collate_out_.weight_type[i];
    }

    return order;
}
|
|
|
|
|
|
// decimally increment the symbolic name, turning something like
// <U1234> into <U1235>, or <jis234> to <jis235>
//
// The first run of decimal digits in SYM is replaced by its value plus
// one; the text before and after the run is kept as is.  The new number
// is zero-padded to the width of the original run so that names with
// leading zeros keep their form (<j0109> becomes <j0110>).  A name that
// contains no decimal digit is returned unchanged.
static std::string
dec_increment (const std::string &sym)
{
    const char* const begin = sym.c_str ();
    const char*       pdig  = begin;

    // look for the first digit but never walk past the terminating NUL;
    // the cast keeps isdigit() well-defined for negative char values
    while (*pdig != '\0' && !std::isdigit (static_cast<unsigned char>(*pdig)))
        ++pdig;

    if (*pdig == '\0')
        return sym;   // nothing to increment

    char *end;

    const unsigned long sym_val = std::strtoul (pdig, &end, 10);

    char numstr [64];
    std::sprintf (numstr, "%lu", sym_val + 1);

    // width of the original number and of the incremented one
    const std::string::size_type old_width = end - pdig;
    const std::string::size_type new_width = std::strlen (numstr);

    std::string next_sym = sym.substr (0, pdig - begin);

    // restore the leading zeros dropped by the conversion
    if (new_width < old_width)
        next_sym.append (old_width - new_width, '0');

    next_sym.append (numstr);
    next_sym.append (end);

    return next_sym;
}
|
|
|
|
|
|
// hexadecimally increment the symbolic name, turning something like
// <U00FA> into <U00FB>
//
// The first run of hexadecimal digits in SYM is replaced by its value
// plus one; the text before and after the run is kept as is.  The new
// number is zero-padded to the width of the original run, and is
// written with lower case digits only when the original run contains a
// lower case digit (upper case otherwise).  A name that contains no
// hexadecimal digit is returned unchanged.
static std::string
hex_increment (const std::string& sym)
{
    const char* const begin = sym.c_str ();
    const char*       pdig  = begin;

    // look for the first digit but never walk past the terminating NUL;
    // the cast keeps isxdigit() well-defined for negative char values
    while (*pdig != '\0' && !std::isxdigit (static_cast<unsigned char>(*pdig)))
        ++pdig;

    if (*pdig == '\0')
        return sym;   // nothing to increment

    char *end;

    const unsigned long sym_val = std::strtoul (pdig, &end, 16);

    // follow the letter case used by the original number
    bool lower_case = false;
    for (const char *p = pdig; p != end; ++p) {
        if ('a' <= *p && *p <= 'f') {
            lower_case = true;
            break;
        }
    }

    char numstr [64];
    std::sprintf (numstr, lower_case ? "%lx" : "%lX", sym_val + 1);

    // width of the original number and of the incremented one
    const std::string::size_type old_width = end - pdig;
    const std::string::size_type new_width = std::strlen (numstr);

    std::string next_sym = sym.substr (0, pdig - begin);

    // restore the leading zeros dropped by the conversion
    if (new_width < old_width)
        next_sym.append (old_width - new_width, '0');

    next_sym.append (numstr);
    next_sym.append (end);

    return next_sym;
}
|
|
|
|
|
|
// scanning ahead and adding all symbols to the collating symbols map
|
|
void Def::preprocess_collation_definitions()
|
|
{
|
|
// start with collation values from 2
|
|
unsigned int coll_value = 2;
|
|
|
|
// previous_elm and next_elm are used for processing ellipsis.
|
|
std::string prev_elm;
|
|
std::string next_elm;
|
|
|
|
static unsigned int max_orders = 0;
|
|
|
|
token_list_t::iterator cs_it = sym_list_.begin ();
|
|
for (; cs_it != sym_list_.end (); ++cs_it) {
|
|
cs_map_iter csm_it = cs_map_.find (cs_it->name);
|
|
if (csm_it != cs_map_.end ())
|
|
csm_it->second = coll_value++;
|
|
}
|
|
|
|
std::list<collate_section_t>::iterator s_it = section_list_.begin ();
|
|
|
|
if (s_it != section_list_.end()) {
|
|
// get maximum weights count
|
|
max_orders = s_it->order.size ();
|
|
}
|
|
|
|
for (; s_it != section_list_.end (); ++s_it) {
|
|
if (max_orders != s_it->order.size ())
|
|
issue_diag (E_COLNUM, true, &*s_it->order.begin (),
|
|
"number of collation orders "
|
|
"different for this section\n");
|
|
}
|
|
|
|
const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();
|
|
|
|
// preprocess sections' collation definitions
|
|
for (s_it = section_list_.begin ();
|
|
s_it != section_list_.end (); ++s_it) {
|
|
|
|
// process the order statement and get the weight
|
|
unsigned int order = process_order_stmt (*s_it);
|
|
|
|
std::list<collate_entry_t>::iterator e_it =
|
|
s_it->entries.begin ();
|
|
|
|
for (; e_it != s_it->entries.end (); ++e_it) {
|
|
switch (e_it->first.token) {
|
|
case Scanner::tok_sym_name:
|
|
// store the previous name
|
|
prev_elm = e_it->first.name;
|
|
|
|
// process each of the collation definitions
|
|
process_collation_definition (false, *e_it,
|
|
coll_value++, order);
|
|
|
|
break;
|
|
|
|
case Scanner::tok_abs_ellipsis: // "<FOO>...<BAR>"
|
|
case Scanner::tok_hex_ellipsis: // "<FOO>..<BAR>"
|
|
case Scanner::tok_dec_ellipsis: // "<FOO>....<BAR>"
|
|
{
|
|
|
|
if (++e_it != s_it->entries.end ())
|
|
next_elm = e_it->first.name;
|
|
else {
|
|
issue_diag (E_SYNTAX, true, &e_it->first,
|
|
"unexpected end of collation section while"
|
|
" processing ellipsis during scan_ahead\n");
|
|
}
|
|
|
|
if (e_it->first.token == Scanner::tok_abs_ellipsis) {
|
|
wchar_t w_cmap_cur_val
|
|
= w_cmap.find(prev_elm)->second;
|
|
wchar_t w_cmap_end_val
|
|
= w_cmap.find(next_elm)->second;
|
|
|
|
// the first value has already been
|
|
// added so don't add it again
|
|
w_cmap_cur_val = charmap_.increment_wchar (w_cmap_cur_val);
|
|
while (w_cmap_cur_val != w_cmap_end_val) {
|
|
// process iteration
|
|
collate_info_t ci = {UINT_MAX, UINT_MAX, 0, 0};
|
|
ci.coll_val = coll_value++;
|
|
ci.order = order;
|
|
|
|
coll_map_.insert (std::make_pair (w_cmap_cur_val, ci));
|
|
w_cmap_cur_val =
|
|
charmap_.increment_wchar (w_cmap_cur_val);
|
|
}
|
|
|
|
// add last element "next_elm" to array
|
|
collate_info_t ci = {UINT_MAX, UINT_MAX, 0, 0};
|
|
ci.coll_val = coll_value++;
|
|
ci.order = order;
|
|
coll_map_.insert (std::make_pair (w_cmap_cur_val, ci));
|
|
|
|
}
|
|
else {
|
|
// we are incrementing the symbolic names
|
|
std::string next_name = prev_elm;
|
|
do {
|
|
if (e_it->first.token == Scanner::tok_hex_ellipsis)
|
|
next_name = hex_increment (next_name);
|
|
else
|
|
next_name = dec_increment (next_name);
|
|
|
|
w_cmap_iter it = w_cmap.find (next_name);
|
|
if (it != w_cmap.end()) {
|
|
// process iteration
|
|
collate_info_t ci = {UINT_MAX, UINT_MAX, 0, 0};
|
|
ci.coll_val = coll_value++;
|
|
ci.order = order;
|
|
coll_map_.insert (
|
|
std::make_pair (it->second, ci));
|
|
}
|
|
} while (next_name != next_elm);
|
|
}
|
|
|
|
prev_elm = next_elm;
|
|
break;
|
|
}
|
|
|
|
case Scanner::tok_dbl_ellipsis:
|
|
issue_diag (W_NOTSUP, true, &e_it->first,
|
|
"ellipsis not supported"
|
|
" processing ellipsis during scan_ahead\n");
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void Def::process_collation_definition ( bool do_weights,
|
|
collate_entry_t& entry,
|
|
unsigned int coll_value,
|
|
unsigned int order)
|
|
{
|
|
const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();
|
|
|
|
// iterators
|
|
w_cmap_iter w_cmap_pos =
|
|
w_cmap.find(entry.first.name);
|
|
ce_map_iter ce_map_pos =
|
|
ce_map_.find(entry.first.name);
|
|
|
|
// look up the symbolic name in the wide character map
|
|
if (w_cmap_pos != w_cmap.end()) {
|
|
|
|
wchar_t wval = w_cmap_pos->second;
|
|
coll_map_iter coll_map_pos = coll_map_.find (wval);
|
|
|
|
coll_map_pos->second.coll_val = coll_value;
|
|
coll_map_pos->second.order = order;
|
|
|
|
// process the weights
|
|
if (do_weights) {
|
|
unsigned int offset = next_offset_++;
|
|
coll_map_pos->second.offset = offset;
|
|
off_mapr_.insert(std::make_pair(offset, entry.first.name));
|
|
|
|
process_weights (entry);
|
|
}
|
|
|
|
}
|
|
else if (ce_map_pos != ce_map_.end()) {
|
|
|
|
ce_map_pos->second.coll_val = coll_value;
|
|
ce_map_pos->second.order = order;
|
|
|
|
// process the weights
|
|
if (do_weights) {
|
|
unsigned int offset = next_offset_++;
|
|
ce_map_pos->second.offset = offset;
|
|
off_mapr_.insert(std::make_pair(offset, entry.first.name));
|
|
|
|
process_weights (entry);
|
|
}
|
|
|
|
}
|
|
else if (cs_map_.find(entry.first.name) != cs_map_.end()) {
|
|
|
|
cs_map_.find(entry.first.name)->second = coll_value;
|
|
|
|
}
|
|
else {
|
|
// the symbol is missing from all three maps;
|
|
// add it in cs map as "sym <-> collating_value" pair
|
|
cs_map_.insert (std::make_pair(entry.first.name, coll_value));
|
|
|
|
warnings_occurred_ =
|
|
issue_diag (W_COLSYM, false, &entry.first,
|
|
"unknown symbol name %s found in "
|
|
"LC_COLLATE definition\n",
|
|
entry.first.name.c_str()) || warnings_occurred_;
|
|
}
|
|
}
|
|
|
|
// Processes the entries of one collation section, weights included.
//
// section     the section to process
// coll_value  next collation value to hand out; incremented for every
//             symbol that receives one
//
// Entries are handled according to the token that starts them:
//   - a symbolic name goes through process_collation_definition() with
//     weight processing turned on;
//   - an ellipsis expands to the range between the preceding symbol
//     and the one on the following line; every member is added with
//     add_to_coll() using the weights given on the ellipsis line;
//   - the UNDEFINED entry records the order and weights to be used for
//     characters without an explicit definition and then calls
//     add_missing_values().
void Def::process_order(collate_section_t& section,
                        unsigned int& coll_value)
{
    // get the orders
    unsigned int section_order = process_order_stmt (section);

    // prev_elm and next_elm are used for processing ellipsis.
    std::string prev_elm;
    std::string next_elm;

    const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();

    // iterate thru section's entries
    std::list<collate_entry_t>::iterator e_it = section.entries.begin ();
    for (; e_it != section.entries.end (); ++e_it) {
        if (e_it->first.token == Scanner::tok_sym_name) {
            prev_elm = e_it->first.name;
            process_collation_definition (true, *e_it,
                                          coll_value++, section_order);
        }
        else if (   e_it->first.token == Scanner::tok_abs_ellipsis
                 || e_it->first.token == Scanner::tok_dec_ellipsis
                 || e_it->first.token == Scanner::tok_hex_ellipsis) {

            // remember the kind of ellipsis before moving on to the
            // entry that closes the range
            const bool abs_ellipsis =
                e_it->first.token == Scanner::tok_abs_ellipsis;
            const bool hex_ellipsis =
                e_it->first.token == Scanner::tok_hex_ellipsis;

            // process line with ellipsis
            Weights_t* weights_template =
                new Weights_t[collate_out_.num_weights];
            std::vector<bool> ordinal_weights;

            token_list_t::iterator w_it = e_it->second.begin ();
            for (int i = 0; i < collate_out_.num_weights
                     && w_it != e_it->second.end (); ++i, ++w_it) {
                ordinal_weights.push_back (
                    get_weight (*w_it, weights_template, i));
            }

            // levels without an explicit weight use the ordinal value
            while (ordinal_weights.size() < collate_out_.num_weights)
                ordinal_weights.push_back (true);

            // next line
            const std::list<collate_entry_t>::iterator ellipsis_it = e_it;
            if (++e_it == section.entries.end ()) {
                // release the template first in case the diagnostic
                // does not return
                delete[] weights_template;

                // report against the ellipsis; end() must not be
                // dereferenced
                issue_diag (E_SYNTAX, true, &ellipsis_it->first,
                            "unexpected end of collation section while"
                            " processing ellipsis\n");

                // nothing is left to process in this section
                break;
            }

            next_elm = e_it->first.name;

            if (abs_ellipsis) {
                const w_cmap_iter first_it = w_cmap.find (prev_elm);
                const w_cmap_iter last_it  = w_cmap.find (next_elm);

                if (first_it == w_cmap.end () || last_it == w_cmap.end ()) {
                    // a range cannot be formed from an unknown symbol
                    const std::string &missing =
                        first_it == w_cmap.end () ? prev_elm : next_elm;

                    warnings_occurred_ =
                        issue_diag (W_COLSYM, false, &e_it->first,
                                    "unknown symbol name %s found in "
                                    "LC_COLLATE definition\n",
                                    missing.c_str ()) || warnings_occurred_;
                }
                else {
                    const wchar_t w_cmap_end_val = last_it->second;

                    // the first value has already been added so don't
                    // add it again
                    wchar_t w_cmap_cur_val =
                        charmap_.increment_wchar (first_it->second);

                    while (w_cmap_cur_val != w_cmap_end_val) {

                        add_to_coll (w_cmap_cur_val, weights_template,
                                     coll_value++, ordinal_weights, false);

                        w_cmap_cur_val =
                            charmap_.increment_wchar (w_cmap_cur_val);
                    }

                    // add the end element to the collation array.
                    add_to_coll (w_cmap_end_val, weights_template,
                                 coll_value++, ordinal_weights, false);
                }
            } else {

                // we are incrementing the symbolic names
                std::string next_name = prev_elm;
                do {
                    if (hex_ellipsis)
                        next_name = hex_increment (next_name);
                    else
                        next_name = dec_increment (next_name);

                    w_cmap_iter it = w_cmap.find (next_name);
                    if (it != w_cmap.end()) {
                        add_to_coll (it->second, weights_template,
                                     coll_value++, ordinal_weights, false);
                    }
                } while (next_name != next_elm);
            }

            // the end of this range is the left-hand symbol of an
            // ellipsis that may follow immediately
            prev_elm = next_elm;

            delete[] weights_template;

        } else if (e_it->first.token == Scanner::tok_undefined) {

            // add all characters that were not explicitly given a collation
            // value in increasing order based on their encoded values
            undefined_keyword_found_            = true;
            collate_out_.undefined_optimization = true;
            undef_char_info_.order              = section_order;

            Weights_t* weights_template
                = new Weights_t[collate_out_.num_weights];
            std::vector<bool> ordinal_weights;

            token_list_t::iterator w_it = e_it->second.begin ();
            for (int i = 0; i < collate_out_.num_weights
                     && w_it != e_it->second.end(); ++i, ++w_it) {

                // an ellipsis among the weights turns the
                // optimization off
                if (w_it->token == Scanner::tok_abs_ellipsis)
                    collate_out_.undefined_optimization = false;

                ordinal_weights.push_back (
                    !get_weight (*w_it, weights_template, i));
            }

            while (ordinal_weights.size() < collate_out_.num_weights)
                ordinal_weights.push_back (false);

            add_missing_values (ordinal_weights, weights_template,
                                coll_value, false);

            delete[] weights_template;
        }
    }
}
|
|
|
|
void Def::add_missing_values (const std::vector<bool> &ordinal_weights,
|
|
const Weights_t* weights_template,
|
|
unsigned int &coll_value, bool give_warning)
|
|
{
|
|
const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();
|
|
|
|
// we want to print a warning message once if there are characters that
|
|
// were not given collation values.
|
|
bool warning_issued = false;
|
|
|
|
symnames_list_iter symnames_it;
|
|
for (symnames_it = charmap_.get_symnames_list ().begin ();
|
|
symnames_it != charmap_.get_symnames_list ().end ();
|
|
++symnames_it) {
|
|
|
|
std::map<std::string, wchar_t>::const_iterator w_cmap_it;
|
|
if ((w_cmap_it = w_cmap.find(*symnames_it)) != w_cmap.end()) {
|
|
wchar_t wchar_val = (*w_cmap_it).second;
|
|
|
|
coll_map_iter coll_map_it;
|
|
if ((coll_map_it = coll_map_.find(wchar_val)) != coll_map_.end()) {
|
|
if (coll_map_it->second.offset == UINT_MAX) {
|
|
if (give_warning && !warning_issued) {
|
|
warning_issued = true;
|
|
warnings_occurred_ =
|
|
issue_diag (W_MISSING, false,
|
|
0, "some characters in the codeset "
|
|
"were not explicitly given a "
|
|
"collation value\n")
|
|
|| warnings_occurred_;
|
|
}
|
|
if (!collate_out_.undefined_optimization) {
|
|
add_to_coll (wchar_val, weights_template,
|
|
coll_value++, ordinal_weights, true);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (collate_out_.undefined_optimization){
|
|
collate_out_.undefined_weight_idx = next_offset_++;
|
|
undef_char_info_.offset = collate_out_.undefined_weight_idx;
|
|
undef_char_info_.coll_val = coll_value++;
|
|
// initialize the weight template with the undefined collation info
|
|
Weights_t* weights = new Weights_t[collate_out_.num_weights];
|
|
|
|
if (0 != weights_template) {
|
|
for (int k = 0; k < collate_out_.num_weights; ++k) {
|
|
weights[k].size = weights_template[k].size;
|
|
for (int j = 0; j < 256; ++j)
|
|
weights[k].weight[j] = weights_template[k].weight[j];
|
|
}
|
|
}
|
|
|
|
|
|
std::vector<bool>::const_iterator it;
|
|
|
|
// now go through the vector that tells us if a particular
|
|
// weight is based on the ordinal position of the element
|
|
// and place the correct ordinal value in the weight vector
|
|
// if it is.
|
|
int i = 0;
|
|
for (it = ordinal_weights.begin();
|
|
it != ordinal_weights.end(); ++it, ++i)
|
|
{
|
|
// FIXME: this may not be correct
|
|
if (!*it){
|
|
weights[i].size = 1;
|
|
weights[i].weight[0] = undef_char_info_.coll_val;
|
|
}
|
|
}
|
|
undef_char_info_.weights = weights;
|
|
|
|
|
|
}
|
|
}
|
|
|
|
void Def::add_to_coll (const wchar_t w_cmap_cur_val,
|
|
const Weights_t* weights_template,
|
|
const unsigned int coll_value,
|
|
const std::vector<bool>& ordinal_weights,
|
|
bool undefined_value)
|
|
{
|
|
Weights_t* weights = new Weights_t[collate_out_.num_weights];
|
|
|
|
if (0 != weights_template) {
|
|
for (int k = 0; k < collate_out_.num_weights; ++k) {
|
|
weights[k].size = weights_template[k].size;
|
|
for (int j = 0; j < 256; ++j)
|
|
weights[k].weight[j] = weights_template[k].weight[j];
|
|
}
|
|
}
|
|
|
|
// get the symbolic name from the value
|
|
std::string name
|
|
= charmap_.get_rw_cmap().find (w_cmap_cur_val)->second;
|
|
coll_map_.find (w_cmap_cur_val)->second.coll_val = coll_value;
|
|
|
|
unsigned int offset;
|
|
if (undefined_value && collate_out_.undefined_optimization)
|
|
offset = next_offset_;
|
|
else
|
|
offset = next_offset_++;
|
|
|
|
const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();
|
|
|
|
w_cmap_iter w_cmap_pos = w_cmap.find(name);
|
|
coll_map_iter coll_it = coll_map_.find (w_cmap_pos->second);
|
|
coll_it->second.offset = offset;
|
|
|
|
off_mapr_.insert(std::make_pair(offset, name));
|
|
|
|
std::vector<bool>::const_iterator it;
|
|
|
|
// now go through the vector that tells us if a particular
|
|
// weight is based on the ordinal position of the element
|
|
// and place the correct ordinal value in the weight vector
|
|
// if it is.
|
|
int i = 0;
|
|
for (it = ordinal_weights.begin();
|
|
it != ordinal_weights.end(); ++it, ++i)
|
|
{
|
|
if (*it){
|
|
weights[i].size = 1;
|
|
weights[i].weight[0] = coll_value;
|
|
}
|
|
}
|
|
coll_it->second.weights = weights;
|
|
}
|
|
|
|
// if undefined optimization is on then only those characters that have
|
|
// been defined should go into the valid set, otherwise all characters go
|
|
// into the set.
|
|
void Def::gen_valid_coll_mb_set ()
|
|
{
|
|
if (!valid_coll_mb_set_.empty())
|
|
return;
|
|
|
|
for (coll_map_iter coll_it = coll_map_.begin();
|
|
coll_it != coll_map_.end(); ++coll_it) {
|
|
if (coll_it->second.offset != UINT_MAX
|
|
|| !collate_out_.undefined_optimization){
|
|
rmb_cmap_iter rn_cmap_it
|
|
= charmap_.get_rmb_cmap().find (coll_it->first);
|
|
std::string valid = rn_cmap_it->second.substr
|
|
(0, rn_cmap_it->second.size() - 1);
|
|
|
|
while (valid.size() > 0){
|
|
valid_coll_mb_set_.insert (valid);
|
|
valid = valid.substr(0, valid.size() - 1);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// In processing the collate section of the locale definition file
|
|
// we will filter out the characters that are not present in the
|
|
// charmap file; these characters when encountered on the left-hand side
|
|
// of a weight definition for a symbolic name will be left in place
|
|
// and used as a mere collating symbol for the rest of the parsing -
|
|
// i.e. its weights ignored and assigned the weight corresponding to
|
|
// its position in the collate section.
|
|
void Def::process_collate()
|
|
{
|
|
issue_diag (I_STAGE, false, 0, "processing LC_COLLATE section\n");
|
|
|
|
// update flags
|
|
collate_def_found_ = true;
|
|
// initialization
|
|
init_coll_map ();
|
|
|
|
// first preprocess the collate section
|
|
preprocess_collate ();
|
|
|
|
const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();
|
|
|
|
// process the collating symbols list
|
|
token_list_t::iterator cs_it = cs_list_.begin ();
|
|
for (;cs_it != cs_list_.end (); ++cs_it) {
|
|
if (w_cmap.find (cs_it->name) !=
|
|
w_cmap.end())
|
|
issue_diag (E_SYNTAX, true, &*cs_it,
|
|
"collating-symbol %s found in character map\n",
|
|
cs_it->name.c_str ());
|
|
else if (ce_map_.find (cs_it->name) != ce_map_.end())
|
|
issue_diag (E_SYNTAX, true, &*cs_it,
|
|
"collating-symbol %s has already been defined as "
|
|
"a collating-element\n", cs_it->name.c_str ());
|
|
|
|
cs_map_.insert (std::make_pair (cs_it->name, 0));
|
|
}
|
|
|
|
// process the collating elements list
|
|
std::list<collate_elem_t>::iterator ce_it =
|
|
ce_list_.begin ();
|
|
for (; ce_it != ce_list_.end (); ++ce_it) {
|
|
if (w_cmap.find (ce_it->first.name) !=
|
|
w_cmap.end()) {
|
|
issue_diag (E_SYNTAX, true, &ce_it->first,
|
|
"collating element %s found in charmap\n",
|
|
ce_it->first.name.c_str());
|
|
}
|
|
|
|
// collating element info to be stored
|
|
ce_info_t ce_tmp;
|
|
ce_tmp.offset = UINT_MAX;
|
|
ce_tmp.coll_val = UINT_MAX;
|
|
ce_tmp.order = 0;
|
|
ce_tmp.weights = 0;
|
|
|
|
// array of symbolic names
|
|
std::vector<std::string> ce_sym_array;
|
|
|
|
// get the definition of this collating element
|
|
token_list_t& ce_def_list = ce_it->second;
|
|
token_list_t::iterator ce_def_it = ce_def_list.begin ();
|
|
bool invalid = false;
|
|
|
|
if (ce_def_it->token == Scanner::tok_string) {
|
|
if ((ce_tmp.ce_wstr = convert_wstring (*ce_def_it)).empty())
|
|
invalid = true;
|
|
} else if (ce_def_it->token == Scanner::tok_sym_name) {
|
|
for (; ce_def_it != ce_def_list.end (); ++ce_def_it)
|
|
ce_sym_array.push_back (ce_def_it->name);
|
|
if ((ce_tmp.ce_wstr = convert_wstring (ce_sym_array)).empty())
|
|
invalid = true;
|
|
} else {
|
|
issue_diag (E_SYNTAX, true, &*ce_def_it,
|
|
"illegal collating-element expression: %s\n",
|
|
ce_def_it->name.c_str ());
|
|
}
|
|
if (invalid)
|
|
issue_diag (W_SYNTAX, false, &*ce_def_it,
|
|
"character in collating element definition "
|
|
"not found in character map\n");
|
|
|
|
if (!invalid) {
|
|
ce_map_.insert (std::make_pair(ce_it->first.name, ce_tmp));
|
|
|
|
std::string ce_str;
|
|
std::string ce_utf8_wstr;
|
|
for (unsigned int i = 0; i < ce_tmp.ce_wstr.size(); ++i){
|
|
ce_str += convert_to_ext(ce_tmp.ce_wstr[i]);
|
|
ce_utf8_wstr += utf8_encode (ce_tmp.ce_wstr[i]);
|
|
}
|
|
ce_sym_map_.insert (std::make_pair(ce_str, ce_it->first.name));
|
|
ce_wsym_map_.insert (std::make_pair (ce_utf8_wstr,
|
|
ce_it->first.name));
|
|
}
|
|
}
|
|
|
|
// preprocess the collation definitions and make known all
|
|
// the symbolic names available
|
|
preprocess_collation_definitions();
|
|
|
|
// collation values 0 and 1 are reserved
|
|
unsigned int coll_value = 2;
|
|
|
|
// FIXME: assign values to the collating-elements AGAIN
|
|
cs_it = sym_list_.begin ();
|
|
for (; cs_it != sym_list_.end (); ++cs_it) {
|
|
cs_map_iter csm_it = cs_map_.find (cs_it->name);
|
|
if (csm_it != cs_map_.end ())
|
|
csm_it->second = coll_value++;
|
|
}
|
|
|
|
// process the sections
|
|
std::list<collate_section_t>::iterator sect_it =
|
|
section_list_.begin ();
|
|
for (; sect_it != section_list_.end (); ++sect_it)
|
|
process_order (*sect_it, coll_value);
|
|
|
|
// check to make sure that all the elements in the codeset were added
|
|
if (!undefined_keyword_found_) {
|
|
std::vector<bool> ordinal_weights;
|
|
for (int i = 0; i < collate_out_.num_weights; ++i)
|
|
ordinal_weights.push_back (false);
|
|
collate_out_.undefined_optimization = true;
|
|
add_missing_values (ordinal_weights, 0, coll_value, true);
|
|
}
|
|
|
|
}
|
|
|
|
|
|
void Def::gen_n_to_w_coll_tables (const std::string &charp,
|
|
unsigned int tab_num)
|
|
{
|
|
gen_valid_coll_mb_set();
|
|
offset_tab_t tab;
|
|
for (unsigned int k = 0; k <= UCHAR_MAX; ++k) {
|
|
tab.off[k] = UINT_MAX;
|
|
}
|
|
|
|
tab.first_offset = -1;
|
|
mb_cmap_iter n_cmap_it;
|
|
|
|
for (unsigned int i = 0; i <= UCHAR_MAX; ++i){
|
|
unsigned char cur_char = (unsigned char)i;
|
|
std::string mb_char = charp;
|
|
mb_char += (char)cur_char;
|
|
if ((n_cmap_it = charmap_.get_mb_cmap().find (mb_char))
|
|
!= charmap_.get_mb_cmap().end()) {
|
|
coll_map_iter coll_map_it = coll_map_.find (n_cmap_it->second);
|
|
if (coll_map_it->second.offset != UINT_MAX
|
|
|| !collate_out_.undefined_optimization) {
|
|
tab.off[cur_char] = coll_map_it->second.offset;
|
|
if (tab.first_offset == -1)
|
|
tab.first_offset = cur_char;
|
|
}
|
|
}
|
|
else {
|
|
if (valid_coll_mb_set_.find (mb_char) != valid_coll_mb_set_.end()){
|
|
++next_tab_num_;
|
|
tab.off[cur_char] = next_tab_num_ | 0x80000000;
|
|
gen_n_to_w_coll_tables (mb_char, next_tab_num_);
|
|
if (tab.first_offset == -1)
|
|
tab.first_offset = cur_char;
|
|
|
|
}
|
|
else {
|
|
tab.off[cur_char] = UINT_MAX;
|
|
}
|
|
}
|
|
}
|
|
char_offs_.insert (std::make_pair (tab_num, tab));
|
|
}
|
|
|
|
|
|
void Def::gen_w_to_n_coll_tables (const std::string &charp,
|
|
unsigned int tab_num)
|
|
{
|
|
// initialize a table used in ctype and collate
|
|
create_wchar_utf8_table();
|
|
gen_valid_coll_wchar_set();
|
|
|
|
offset_tab_t tab;
|
|
for (unsigned int k = 0; k <= UCHAR_MAX; ++k)
|
|
tab.off[k] = UINT_MAX;
|
|
|
|
tab.first_offset = -1;
|
|
wchar_utf8_iter wu_it;
|
|
mb_cmap_iter n_cmap_it;
|
|
|
|
for (unsigned int i = 0; i <= UCHAR_MAX; ++i) {
|
|
unsigned char cur_char = (unsigned char)i;
|
|
std::string mb_char = (charp);
|
|
|
|
mb_char += (char)cur_char;
|
|
|
|
wu_it = wchar_utf8_to_ext_.find (mb_char);
|
|
if (wu_it != wchar_utf8_to_ext_.end()) {
|
|
n_cmap_it = charmap_.get_mb_cmap().find (wu_it->second);
|
|
if (n_cmap_it != charmap_.get_mb_cmap().end ()) {
|
|
|
|
coll_map_iter coll_map_it = coll_map_.find (n_cmap_it->second);
|
|
if (coll_map_it->second.offset != UINT_MAX ||
|
|
!collate_out_.undefined_optimization) {
|
|
tab.off[cur_char] = coll_map_it->second.offset;
|
|
if (tab.first_offset == -1)
|
|
tab.first_offset = cur_char;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
if (valid_coll_wchar_set_.find (mb_char) !=
|
|
valid_coll_wchar_set_.end()) {
|
|
|
|
++next_wchar_coll_tab_num_;
|
|
tab.off[cur_char] = next_wchar_coll_tab_num_ | 0x80000000;
|
|
gen_w_to_n_coll_tables (mb_char,
|
|
next_wchar_coll_tab_num_);
|
|
if (tab.first_offset == -1)
|
|
tab.first_offset = cur_char;
|
|
}
|
|
else
|
|
tab.off[cur_char] = UINT_MAX;
|
|
}
|
|
|
|
w_to_n_coll_.insert (std::make_pair (tab_num, tab));
|
|
}
|
|
|
|
|
|
void Def::gen_n_ce_tables (const std::set<std::string>ce_elms,
|
|
unsigned int idx, unsigned int tab_num)
|
|
{
|
|
if (ce_elms.size() > 0) {
|
|
ce_offset_tab_t tab;
|
|
for (unsigned int k = 0; k <= UCHAR_MAX; ++k) {
|
|
tab.off[k] = UINT_MAX;
|
|
}
|
|
tab.first_offset = -1;
|
|
tab.last_offset = 0;
|
|
ce_sym_map_iter ce_sym_map_it;
|
|
ce_map_iter ce_map_it;
|
|
|
|
std::set<std::string>::const_iterator ce_elms_it;
|
|
std::set<std::string> next_elms;
|
|
|
|
for (unsigned int i = 0; i <= UCHAR_MAX; ++i) {
|
|
next_elms.clear();
|
|
for (ce_elms_it = ce_elms.begin(); ce_elms_it != ce_elms.end();
|
|
ce_elms_it ++) {
|
|
|
|
if ((unsigned char)i == (unsigned char)((*ce_elms_it)[idx])) {
|
|
if ((*ce_elms_it).size() == idx + 1) {
|
|
if ((ce_sym_map_it = ce_sym_map_.find (*ce_elms_it))
|
|
!= ce_sym_map_.end()) {
|
|
ce_map_it = ce_map_.find(ce_sym_map_it->second);
|
|
tab.off[i] = ce_map_it->second.offset;
|
|
if (tab.first_offset == -1)
|
|
tab.first_offset = i;
|
|
if ((unsigned int)tab.last_offset < i)
|
|
tab.last_offset = i;
|
|
}
|
|
}
|
|
else {
|
|
|
|
next_elms.insert (*ce_elms_it);
|
|
if (tab.off[i] == UINT_MAX) {
|
|
++ next_n_ce_tab_num_;
|
|
tab.off[i] = next_n_ce_tab_num_ | 0x80000000;
|
|
}
|
|
if (tab.first_offset == -1)
|
|
tab.first_offset = i;
|
|
if ((unsigned int)tab.last_offset < i)
|
|
tab.last_offset = i;
|
|
}
|
|
}
|
|
}
|
|
// now recursively call gen_n_ce_tables if any collating
|
|
// elements with this character were found;
|
|
if (next_elms.size() != 0)
|
|
gen_n_ce_tables (next_elms, idx + 1, next_n_ce_tab_num_);
|
|
}
|
|
n_ce_offs_.insert(std::make_pair(tab_num, tab));
|
|
|
|
}
|
|
}
|
|
|
|
|
|
void Def::gen_w_ce_tables (const std::set<std::string>ce_elms,
|
|
unsigned int idx, unsigned int tab_num)
|
|
{
|
|
if (ce_elms.size() > 0) {
|
|
ce_offset_tab_t tab;
|
|
for (unsigned int k = 0; k <= UCHAR_MAX; ++k) {
|
|
tab.off[k] = UINT_MAX;
|
|
}
|
|
tab.first_offset = -1;
|
|
tab.last_offset = 0;
|
|
ce_sym_map_iter ce_sym_map_it;
|
|
ce_map_iter ce_map_it;
|
|
|
|
std::set<std::string>::const_iterator ce_elms_it;
|
|
std::set<std::string> next_elms;
|
|
|
|
for (unsigned int i = 0; i <= UCHAR_MAX; ++i) {
|
|
next_elms.clear();
|
|
for (ce_elms_it = ce_elms.begin(); ce_elms_it != ce_elms.end();
|
|
ce_elms_it ++) {
|
|
|
|
if ((unsigned char)i == (unsigned char)(*ce_elms_it)[idx]) {
|
|
if ((*ce_elms_it).size() == idx + 1) {
|
|
if ((ce_sym_map_it = ce_wsym_map_.find (*ce_elms_it))
|
|
!= ce_wsym_map_.end()) {
|
|
ce_map_it = ce_map_.find(ce_sym_map_it->second);
|
|
tab.off[i] = ce_map_it->second.offset;
|
|
if (tab.first_offset == -1)
|
|
tab.first_offset = i;
|
|
if ((unsigned int)tab.last_offset < i)
|
|
tab.last_offset = i;
|
|
}
|
|
}
|
|
else {
|
|
next_elms.insert (*ce_elms_it);
|
|
if (tab.off[i] == UINT_MAX) {
|
|
++ next_w_ce_tab_num_;
|
|
tab.off[i] = next_w_ce_tab_num_ | 0x80000000;
|
|
}
|
|
if (tab.first_offset == -1)
|
|
tab.first_offset = i;
|
|
if ((unsigned int)tab.last_offset < i)
|
|
tab.last_offset = i;
|
|
}
|
|
}
|
|
}
|
|
// now recursively call gen_w_ce_tables if any collating
|
|
// elements with this character were found;
|
|
if (next_elms.size() != 0)
|
|
gen_w_ce_tables (next_elms, idx + 1, next_w_ce_tab_num_);
|
|
}
|
|
w_ce_offs_.insert(std::make_pair(tab_num, tab));
|
|
}
|
|
}
|
|
|
|
|
|
void Def::dump_collate ()
|
|
{
|
|
std::cout << "LC_COLLATE\n";
|
|
|
|
const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();
|
|
|
|
token_list_t::iterator sl_it = sym_list_.begin ();
|
|
for (; sl_it != sym_list_.end (); ++sl_it) {
|
|
std::cout << sl_it->name << " % ";
|
|
cs_map_iter cs_it = cs_map_.find (sl_it->name);
|
|
if (cs_it != cs_map_.end ()) {
|
|
std::cout << cs_it->second << '\n';
|
|
continue;
|
|
}
|
|
|
|
ce_map_iter ce_it = ce_map_.find (sl_it->name);
|
|
if (ce_it != ce_map_.end ()) {
|
|
std::cout << ce_it->second.coll_val << '\n';
|
|
continue;
|
|
}
|
|
|
|
std::map<std::string, wchar_t >::const_iterator cw_it =
|
|
w_cmap.find (sl_it->name);
|
|
if (cw_it != w_cmap.end ()) {
|
|
coll_map_iter cm_it =
|
|
coll_map_.find (cw_it->second);
|
|
if (cm_it != coll_map_.end ()) {
|
|
std::cout << cm_it->second.coll_val << '\n';
|
|
continue;
|
|
}
|
|
}
|
|
std::cout << '\n';
|
|
}
|
|
|
|
std::cout << "\n\n";
|
|
|
|
std::list<collate_section_t>::iterator sect_it =
|
|
section_list_.begin ();
|
|
while (sect_it != section_list_.end ()) {
|
|
|
|
// dump the orders
|
|
std::cout << "order_start " << sect_it->name;
|
|
token_list_t::iterator o_it = sect_it->order.begin ();
|
|
for (; o_it != sect_it->order.end (); ++o_it)
|
|
std::cout << ";" << o_it->name;
|
|
std::cout << '\n';
|
|
|
|
// for each entry in the entries list
|
|
collate_entry_list_t::iterator e_it = sect_it->entries.begin ();
|
|
for (; e_it != sect_it->entries.end (); ++e_it) {
|
|
|
|
// dump the collation definition (weights included)
|
|
std::cout << e_it->first.name << " ";
|
|
if (!e_it->second.empty ()) {
|
|
token_list_t::iterator w_it = e_it->second.begin ();
|
|
std::cout << (w_it++)->name;
|
|
for (; w_it != e_it->second.end (); ++w_it)
|
|
std::cout << ";" << w_it->name;
|
|
}
|
|
|
|
// lookup the value associated with this collation symbol
|
|
cs_map_iter cs_it = cs_map_.find (e_it->first.name);
|
|
if (cs_it != cs_map_.end ()) {
|
|
std::cout << " % " << cs_it->second << '\n';
|
|
continue;
|
|
}
|
|
|
|
ce_map_iter ce_it = ce_map_.find (e_it->first.name);
|
|
if (ce_it != ce_map_.end ()) {
|
|
std::cout << " % " << ce_it->second.coll_val << '\n';
|
|
continue;
|
|
}
|
|
|
|
std::map<std::string, wchar_t >::const_iterator cw_it =
|
|
w_cmap.find (e_it->first.name);
|
|
if (cw_it != w_cmap.end ()) {
|
|
coll_map_iter cm_it =
|
|
coll_map_.find (cw_it->second);
|
|
if (cm_it != coll_map_.end ()) {
|
|
std::cout << " % " << cm_it->second.coll_val << '\n';
|
|
continue;
|
|
}
|
|
|
|
}
|
|
std::cout << '\n';
|
|
}
|
|
|
|
std::cout << "order_end\n";
|
|
++sect_it;
|
|
}
|
|
std::cout << "END LC_COLLATE\n";
|
|
}
|
|
|
|
|
|
void Def::write_collate (std::string dir_name)
|
|
{
|
|
assert (!dir_name.empty());
|
|
|
|
static const char lc_name[] = "LC_COLLATE";
|
|
|
|
if (collate_written_)
|
|
return;
|
|
|
|
if (!collate_def_found_) {
|
|
issue_diag (I_SKIP, false, 0,
|
|
"%s section not found, skipping\n", lc_name);
|
|
return;
|
|
}
|
|
|
|
next_tab_num_ = 0;
|
|
next_wchar_coll_tab_num_ = 0;
|
|
std::set<std::string> ce_strs;
|
|
ce_sym_map_iter it;
|
|
ce_map_iter ce_mit;
|
|
for (it = ce_sym_map_.begin (); it != ce_sym_map_.end (); ++it) {
|
|
if ((ce_mit = ce_map_.find (it->second))->second.coll_val
|
|
!= UINT_MAX) {
|
|
// check to see of the largest_ce needs to be changed
|
|
if (ce_mit->second.ce_wstr.size() + 1
|
|
> collate_out_.largest_ce)
|
|
collate_out_.largest_ce = ce_mit->second.ce_wstr.size();
|
|
|
|
ce_strs.insert (it->first);
|
|
}
|
|
}
|
|
next_n_ce_tab_num_ = 0;
|
|
gen_n_ce_tables(ce_strs, 0, 0);
|
|
ce_strs.clear();
|
|
for (it = ce_wsym_map_.begin (); it != ce_wsym_map_.end (); ++it) {
|
|
if (ce_map_.find (it->second)->second.coll_val != UINT_MAX)
|
|
ce_strs.insert (it->first);
|
|
}
|
|
next_w_ce_tab_num_ = 0;
|
|
gen_w_ce_tables (ce_strs, 0, 0);
|
|
|
|
gen_n_to_w_coll_tables ("", next_tab_num_);
|
|
gen_w_to_n_coll_tables ("", next_wchar_coll_tab_num_);
|
|
|
|
(dir_name += _RWSTD_PATH_SEP) += lc_name;
|
|
|
|
issue_diag (I_OPENWR, false, 0, "writing %s\n", dir_name.c_str ());
|
|
|
|
std::ofstream out (dir_name.c_str(), std::ios::binary);
|
|
out.exceptions (std::ios::failbit | std::ios::badbit);
|
|
|
|
unsigned int i;
|
|
|
|
// calculate the size of an individual weight element
|
|
collate_out_.elm_size = collate_out_.num_weights
|
|
* collate_out_.longest_weight * sizeof (unsigned int) +
|
|
sizeof (unsigned int);
|
|
|
|
// the first section of the collate database is the collating
|
|
// element information
|
|
collate_out_.n_ce_tab_off = 0;
|
|
collate_out_.w_ce_tab_off = collate_out_.n_ce_tab_off;
|
|
n_ce_offs_iter n_ce_offs_it;
|
|
for (n_ce_offs_it = n_ce_offs_.begin();
|
|
n_ce_offs_it != n_ce_offs_.end (); ++n_ce_offs_it) {
|
|
collate_out_.w_ce_tab_off += (n_ce_offs_it->second.last_offset
|
|
- n_ce_offs_it->second.first_offset + 1)* sizeof (int);
|
|
}
|
|
|
|
// next comes the weight information
|
|
collate_out_.weight_tab_off = collate_out_.w_ce_tab_off;
|
|
w_ce_offs_iter w_ce_offs_it;
|
|
for (w_ce_offs_it = w_ce_offs_.begin();
|
|
w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) {
|
|
collate_out_.weight_tab_off += (w_ce_offs_it->second.last_offset
|
|
- w_ce_offs_it->second.first_offset + 1)* sizeof (int);
|
|
}
|
|
|
|
coll_map_iter coll_map_pos;
|
|
|
|
collate_out_.num_elms = off_mapr_.size();
|
|
if (collate_out_.undefined_optimization)
|
|
++collate_out_.num_elms;
|
|
|
|
// now calculate the offset for the first narrow character table
|
|
collate_out_.n_char_tab_off = collate_out_.weight_tab_off
|
|
+ collate_out_.num_elms * collate_out_.elm_size;
|
|
|
|
// now calculate the offset fo the fist wide character table
|
|
// but first we need the size of the narrow tables
|
|
char_offs_iter char_offs_it;
|
|
unsigned int char_offs_size = 0;
|
|
for (char_offs_it = char_offs_.begin();
|
|
char_offs_it != char_offs_.end(); ++char_offs_it) {
|
|
char_offs_size += (UCHAR_MAX + 1
|
|
- char_offs_it->second.first_offset)
|
|
* sizeof (unsigned int);
|
|
}
|
|
|
|
collate_out_.w_char_tab_off = collate_out_.n_char_tab_off
|
|
+ char_offs_size;
|
|
|
|
// now calculate the offset for the narrow character offset table
|
|
// but first we need the size of the wide tables
|
|
w_to_n_coll_iter w_to_n_coll_it;
|
|
unsigned int w_to_n_size = 0;
|
|
for (w_to_n_coll_it = w_to_n_coll_.begin();
|
|
w_to_n_coll_it != w_to_n_coll_.end(); ++w_to_n_coll_it) {
|
|
w_to_n_size += (UCHAR_MAX + 1
|
|
- w_to_n_coll_it->second.first_offset)
|
|
* sizeof (unsigned int);
|
|
}
|
|
|
|
collate_out_.n_char_off_tab_off = collate_out_.w_char_tab_off
|
|
+ w_to_n_size;
|
|
|
|
// now calculate the offset for the wide character offset table
|
|
collate_out_.w_char_off_tab_off = collate_out_.n_char_off_tab_off
|
|
+ char_offs_.size() * sizeof (unsigned int);
|
|
|
|
// calculate the offset for the narrow collating element offset table
|
|
collate_out_.n_ce_off_tab_off = collate_out_.w_char_off_tab_off
|
|
+ w_to_n_coll_.size() * sizeof (unsigned int);
|
|
|
|
// calculate the offset for the wide collating element offset table
|
|
collate_out_.w_ce_off_tab_off = collate_out_.n_ce_off_tab_off
|
|
+ n_ce_offs_.size() * sizeof (unsigned int);
|
|
|
|
// now calculate the offset of the first character information
|
|
collate_out_.n_char_first_char_off = collate_out_.w_ce_off_tab_off
|
|
+ w_ce_offs_.size() * sizeof (unsigned int);
|
|
|
|
// now calculate the offset of the wide table first char info
|
|
collate_out_.w_char_first_char_off = collate_out_.n_char_first_char_off
|
|
+ char_offs_.size() * sizeof (unsigned char);
|
|
|
|
// now calculate the offset of the narrow ce first character info
|
|
collate_out_.n_ce_first_char_off = collate_out_.w_char_first_char_off
|
|
+ w_to_n_coll_.size() * sizeof (unsigned char);
|
|
|
|
// now calculate the offset of the wide ce first character info
|
|
collate_out_.w_ce_first_char_off = collate_out_.n_ce_first_char_off
|
|
+ n_ce_offs_.size() * sizeof (unsigned char);
|
|
|
|
// now calculate the offset of the narrow ce last character info
|
|
collate_out_.n_ce_last_char_off = collate_out_.w_ce_first_char_off
|
|
+ w_ce_offs_.size() * sizeof (unsigned char);
|
|
|
|
// now calculate the offset of the wide ce last character info
|
|
collate_out_.w_ce_last_char_off = collate_out_.n_ce_last_char_off
|
|
+ n_ce_offs_.size() * sizeof (unsigned char);
|
|
|
|
// now calculate the offset of the codeset name
|
|
collate_out_.codeset_off = collate_out_.w_ce_last_char_off
|
|
+ w_ce_offs_.size() * sizeof (unsigned char);
|
|
|
|
// finally calculate the offset of the charmap name
|
|
collate_out_.charmap_off = collate_out_.codeset_off
|
|
+ charmap_.get_code_set_name().size() + 1;
|
|
|
|
|
|
|
|
// print out the collate struct
|
|
out.write ((char*)&collate_out_, sizeof(collate_out_));
|
|
|
|
for (n_ce_offs_it = n_ce_offs_.begin();
|
|
n_ce_offs_it != n_ce_offs_.end(); ++n_ce_offs_it) {
|
|
for (i = (unsigned int)n_ce_offs_it->second.first_offset;
|
|
i <= (unsigned int)n_ce_offs_it->second.last_offset; ++i)
|
|
out.write ((char*)&n_ce_offs_it->second.off[i],
|
|
sizeof (n_ce_offs_it->second.off[i]));
|
|
}
|
|
|
|
for (w_ce_offs_it = w_ce_offs_.begin();
|
|
w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) {
|
|
for (i = (unsigned int)w_ce_offs_it->second.first_offset;
|
|
i <= (unsigned int)w_ce_offs_it->second.last_offset; ++i)
|
|
out.write ((char*)&w_ce_offs_it->second.off[i],
|
|
sizeof (w_ce_offs_it->second.off[i]));
|
|
}
|
|
|
|
|
|
// now print out the weight array
|
|
unsigned int maxw = UINT_MAX;
|
|
Weights_t* weights;
|
|
bool undefined_written = false;
|
|
|
|
const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();
|
|
|
|
std::size_t off_idx = 0;
|
|
off_mapr_iter current_off = off_mapr_.begin();
|
|
for (; current_off != off_mapr_.end(); ++off_idx) {
|
|
|
|
// check the current offset
|
|
if (current_off->first != off_idx
|
|
&& off_idx != undef_char_info_.offset)
|
|
issue_diag (E_RANGE, true, 0,
|
|
"current_off : %d, index : %d, giving up\n",
|
|
current_off->first, off_idx);
|
|
|
|
if (collate_out_.undefined_optimization
|
|
&& off_idx == undef_char_info_.offset) {
|
|
undefined_written = true;
|
|
weights = undef_char_info_.weights;
|
|
out.write ((char*)&undef_char_info_.order,
|
|
sizeof (undef_char_info_.order));
|
|
for (int j = 0; j < collate_out_.num_weights; ++j) {
|
|
for (int k = 0; k < weights[j].size; ++k) {
|
|
out.write ((char*)&weights[j].weight[k],
|
|
sizeof (weights[j].weight[k]));
|
|
}
|
|
for (int c = weights[j].size;
|
|
c < collate_out_.longest_weight; ++c)
|
|
out.write ((char*)&maxw, sizeof (maxw));
|
|
}
|
|
continue;
|
|
}
|
|
|
|
w_cmap_iter w_cmap_pos = w_cmap.find
|
|
(current_off->second);
|
|
if (w_cmap_pos != w_cmap.end()) {
|
|
coll_map_pos = coll_map_.find (w_cmap_pos->second);
|
|
out.write ((char*)&coll_map_pos->second.order,
|
|
sizeof (coll_map_pos->second.order));
|
|
weights = coll_map_pos->second.weights;
|
|
for (int j = 0; j < collate_out_.num_weights; ++j) {
|
|
for (int k = 0; k < weights[j].size; ++k) {
|
|
out.write ((char*)&weights[j].weight[k],
|
|
sizeof (weights[j].weight[k]));
|
|
}
|
|
for (int c = weights[j].size;
|
|
c < collate_out_.longest_weight; ++c)
|
|
out.write ((char*)&maxw, sizeof (maxw));
|
|
}
|
|
++current_off;
|
|
}
|
|
|
|
else {
|
|
ce_map_iter ce_map_it = ce_map_.find (current_off->second);
|
|
if (ce_map_it != ce_map_.end()
|
|
&& ce_map_it->second.coll_val != UINT_MAX) {
|
|
out.write ((char*)&ce_map_it->second.order,
|
|
sizeof (ce_map_it->second.order));
|
|
weights = ce_map_it->second.weights;
|
|
for (int j = 0; j < collate_out_.num_weights; ++j) {
|
|
for (int k = 0; k < weights[j].size; ++k) {
|
|
out.write ((char*)&weights[j].weight[k],
|
|
sizeof (weights[j].weight[k]));
|
|
}
|
|
for (int c = weights[j].size;
|
|
c < collate_out_.longest_weight; ++c)
|
|
out.write ((char*)&maxw, sizeof (maxw));
|
|
}
|
|
|
|
}
|
|
++current_off;
|
|
}
|
|
}
|
|
|
|
// maske sure that we got to the undefined value
|
|
if (!undefined_written && collate_out_.undefined_optimization) {
|
|
if (off_idx != undef_char_info_.offset)
|
|
issue_diag (E_RANGE, true, 0,
|
|
"current_off : %d, index : %d, giving up\n",
|
|
undef_char_info_.offset, off_idx);
|
|
|
|
weights = undef_char_info_.weights;
|
|
out.write ((char*)&undef_char_info_.order,
|
|
sizeof (undef_char_info_.order));
|
|
for (int j = 0; j < collate_out_.num_weights; ++j) {
|
|
for (int k = 0; k < weights[j].size; ++k) {
|
|
out.write ((char*)&weights[j].weight[k],
|
|
sizeof (weights[j].weight[k]));
|
|
}
|
|
for (int c = weights[j].size;
|
|
c < collate_out_.longest_weight; ++c)
|
|
out.write ((char*)&maxw, sizeof (maxw));
|
|
}
|
|
}
|
|
|
|
// now print out the narrow character tables
|
|
for (char_offs_it = char_offs_.begin();
|
|
char_offs_it != char_offs_.end(); ++char_offs_it) {
|
|
for (unsigned int c = char_offs_it->second.first_offset;
|
|
c <= UCHAR_MAX; ++c) {
|
|
out.write ((const char*)&char_offs_it->second.off[c],
|
|
sizeof (char_offs_it->second.off[c]));
|
|
}
|
|
}
|
|
|
|
// now print out the wide character tables
|
|
for (w_to_n_coll_it = w_to_n_coll_.begin();
|
|
w_to_n_coll_it != w_to_n_coll_.end(); ++w_to_n_coll_it) {
|
|
for (unsigned int c = w_to_n_coll_it->second.first_offset;
|
|
c <= UCHAR_MAX; ++c) {
|
|
out.write ((const char*)&w_to_n_coll_it->second.off[c],
|
|
sizeof (w_to_n_coll_it->second.off[c]));
|
|
}
|
|
}
|
|
|
|
// now print the narrow character table offsets
|
|
unsigned int current_offset = 0;
|
|
for (char_offs_it = char_offs_.begin();
|
|
char_offs_it != char_offs_.end(); ++char_offs_it) {
|
|
out.write ((const char*)¤t_offset, sizeof (current_offset));
|
|
current_offset += (UCHAR_MAX + 1
|
|
- char_offs_it->second.first_offset);
|
|
}
|
|
|
|
// now print the wide character table offsets
|
|
current_offset = 0;
|
|
for (w_to_n_coll_it = w_to_n_coll_.begin();
|
|
w_to_n_coll_it != w_to_n_coll_.end(); ++w_to_n_coll_it) {
|
|
out.write ((const char*)¤t_offset, sizeof (current_offset));
|
|
current_offset += (UCHAR_MAX + 1
|
|
- w_to_n_coll_it->second.first_offset);
|
|
}
|
|
|
|
// now print the narrow ce table offsets
|
|
current_offset = 0;
|
|
for (n_ce_offs_it = n_ce_offs_.begin();
|
|
n_ce_offs_it != n_ce_offs_.end(); ++n_ce_offs_it) {
|
|
out.write ((const char*)¤t_offset, sizeof (current_offset));
|
|
current_offset += (n_ce_offs_it->second.last_offset
|
|
- n_ce_offs_it->second.first_offset + 1);
|
|
}
|
|
|
|
// now print the wide ce table offsets
|
|
current_offset = 0;
|
|
for (w_ce_offs_it = w_ce_offs_.begin();
|
|
w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) {
|
|
out.write ((const char*)¤t_offset, sizeof (current_offset));
|
|
current_offset += (w_ce_offs_it->second.last_offset
|
|
- w_ce_offs_it->second.first_offset + 1);
|
|
}
|
|
|
|
|
|
// now print out the narrow character tables starting character
|
|
for (char_offs_it = char_offs_.begin();
|
|
char_offs_it != char_offs_.end(); ++char_offs_it) {
|
|
const char off = char ((char_offs_it->second).first_offset);
|
|
out << off;
|
|
}
|
|
|
|
// now print out the wide character tables starting character
|
|
for (w_to_n_coll_it = w_to_n_coll_.begin();
|
|
w_to_n_coll_it != w_to_n_coll_.end(); ++w_to_n_coll_it) {
|
|
const char off = char ((w_to_n_coll_it->second).first_offset);
|
|
out << off;
|
|
}
|
|
|
|
// now print out the narrow ce tables starting character
|
|
for (n_ce_offs_it = n_ce_offs_.begin();
|
|
n_ce_offs_it != n_ce_offs_.end(); ++n_ce_offs_it) {
|
|
const char off = char ((n_ce_offs_it->second).first_offset);
|
|
out << off;
|
|
}
|
|
|
|
// now print out the wide ce tables starting character
|
|
for (w_ce_offs_it = w_ce_offs_.begin();
|
|
w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) {
|
|
const char off = char ((w_ce_offs_it->second).first_offset);
|
|
out << off;
|
|
}
|
|
|
|
// now print out the narrow ce tables ending character
|
|
for (n_ce_offs_it = n_ce_offs_.begin();
|
|
n_ce_offs_it != n_ce_offs_.end(); ++n_ce_offs_it) {
|
|
const char off = char ((n_ce_offs_it->second).last_offset);
|
|
out << off;
|
|
}
|
|
|
|
// now print out the wide ce tables ending character
|
|
for (w_ce_offs_it = w_ce_offs_.begin();
|
|
w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) {
|
|
const char off = char ((w_ce_offs_it->second).last_offset);
|
|
out << off;
|
|
}
|
|
|
|
// finally write the codeset and charmap names
|
|
out << charmap_.get_code_set_name() << std::ends
|
|
<< charmap_.get_charmap_name() << std::ends;
|
|
}
|
|
|
|
|
|
void Def::init_coll_map() {
|
|
|
|
rw_cmap_iter rw_cmap_pos;
|
|
collate_info_t tmp = {UINT_MAX, UINT_MAX, 0, 0};
|
|
|
|
for (rw_cmap_pos = charmap_.get_rw_cmap().begin();
|
|
rw_cmap_pos != charmap_.get_rw_cmap().end();
|
|
++rw_cmap_pos) {
|
|
coll_map_.insert (std::make_pair (rw_cmap_pos->first, tmp));
|
|
}
|
|
}
|
|
|
|
|
|
// In the preprocess_collate member function the collate section is
|
|
// preprocessed and the result of the preprocessing is stored in-memory
|
|
// as a number of lists; the content of these lists is then preprocessed
|
|
// acoording to the reorder statements; the result is then passed
|
|
// on to process_collate.
|
|
void Def::preprocess_collate ()
|
|
{
|
|
int nesting_level = 0;
|
|
|
|
do {
|
|
// fetch next token
|
|
next = scanner_.next_token();
|
|
|
|
switch (next.token) {
|
|
case Scanner::tok_end:
|
|
next = scanner_.next_token();
|
|
if (next.token == Scanner::tok_collate) {
|
|
// end of collation block
|
|
if (nesting_level == 0)
|
|
return;
|
|
|
|
nesting_level--;
|
|
scanner_.close ();
|
|
} else
|
|
issue_diag (E_SYNTAX, true, &next,
|
|
"wrong section name in END directive\n");
|
|
|
|
break;
|
|
case Scanner::tok_copy: {
|
|
next = scanner_.next_token();
|
|
if (next.token != Scanner::tok_string)
|
|
issue_diag (E_SYNTAX, true, &next,
|
|
"expected string following \"copy\" directive\n");
|
|
|
|
// bump up the nesting level
|
|
++nesting_level;
|
|
|
|
issue_diag (I_STAGE, false, 0, "processing copy directive\n");
|
|
|
|
// open the file
|
|
scanner_.open (get_pathname (strip_quotes (next.name), next.file));
|
|
|
|
// get comment char and escape char;
|
|
// these informations are stored by the scanner
|
|
while ((next = scanner_.next_token ()).token
|
|
!= Scanner::tok_collate ){
|
|
// the LC_IDENTIFICATION section may also have a
|
|
// LC_COLLATE token that will mess up the parsing
|
|
if (next.token == Scanner::tok_ident) {
|
|
while ((next = scanner_.next_token()).token
|
|
!= Scanner::tok_end );
|
|
next = scanner_.next_token();
|
|
}
|
|
}
|
|
|
|
break;
|
|
}
|
|
|
|
// a collating element definition defines a symbolic name that
|
|
// represents symbolically the congregation of two other symbolic
|
|
// names which have to be present in the character map;
|
|
// the form of the phrase is:
|
|
// collating-element sym from ("string" | (sym sym+))
|
|
case Scanner::tok_coll_elem: {
|
|
next = scanner_.next_token();
|
|
// we expect a symbolic name
|
|
if (next.token != Scanner::tok_sym_name)
|
|
issue_diag (E_SYNTAX, true, &next,
|
|
"symbolic name expected following "
|
|
"collating-element\n");
|
|
// symbolic name
|
|
token_t sym (next);
|
|
|
|
next = scanner_.next_token ();
|
|
if (next.token != Scanner::tok_from)
|
|
issue_diag (E_SYNTAX, true, &next,
|
|
"illegal collating-element expression\n");
|
|
|
|
token_list_t ce_def_list;
|
|
next = scanner_.next_token ();
|
|
if (next.token == Scanner::tok_string) {
|
|
ce_def_list.push_back (next);
|
|
ce_list_.push_back (std::make_pair(sym,ce_def_list));
|
|
} else if (next.token == Scanner::tok_sym_name) {
|
|
do {
|
|
ce_def_list.push_back (next);
|
|
next = scanner_.next_token();
|
|
} while (next.token != Scanner::tok_nl);
|
|
ce_list_.push_back (std::make_pair(sym,ce_def_list));
|
|
} else
|
|
issue_diag (E_SYNTAX, true, &next,
|
|
"illegal collating-element expression\n");
|
|
break;
|
|
}
|
|
case Scanner::tok_coll_sym:
|
|
next = scanner_.next_token();
|
|
if (next.token != Scanner::tok_sym_name)
|
|
issue_diag (E_SYNTAX, true, &next,
|
|
"symbolic name expected following "
|
|
"collating-symbol\n");
|
|
cs_list_.push_back (next);
|
|
break;
|
|
case Scanner::tok_script:
|
|
next = scanner_.next_token ();
|
|
if (next.token != Scanner::tok_sym_name)
|
|
issue_diag (W_SYNTAX, false, &next,
|
|
"expecting script name, got %s\n",
|
|
next.name.c_str ());
|
|
script_list_.push_back(next);
|
|
break;
|
|
case Scanner::tok_sym_name:
|
|
sym_list_.push_back (next);
|
|
break;
|
|
case Scanner::tok_order_start:
|
|
preprocess_order();
|
|
break;
|
|
|
|
case Scanner::tok_reorder:
|
|
preprocess_reorder ();
|
|
break;
|
|
|
|
case Scanner::tok_reorder_section:
|
|
preprocess_reorder_section ();
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
while (Scanner::tok_end_tokens != next.token);
|
|
}
|
|
|
|
|
|
// The task of preprocess_order is to parse and model the content of the
|
|
// order sections in the input files
|
|
void Def::preprocess_order ()
|
|
{
|
|
// one order-start keyword has been encountered;
|
|
// push a collate_section down the list and use it
|
|
while (next.token != Scanner::tok_order_end) {
|
|
section_list_.push_back (collate_section_t());
|
|
collate_section_t& section = section_list_.back ();
|
|
|
|
next = scanner_.next_token ();
|
|
|
|
// store the orders
|
|
while (next.token != Scanner::tok_nl) {
|
|
if (next.token == Scanner::tok_sym_name) {
|
|
// symbolic name, has to be a section name
|
|
section.name = next.name;
|
|
} else {
|
|
// might be a combination of order and position
|
|
// do the same until a better way is found
|
|
section.order.push_back (next);
|
|
}
|
|
|
|
// fetch next token
|
|
next = scanner_.next_token ();
|
|
}
|
|
|
|
// unnamed sections
|
|
if (section.name == "")
|
|
section.name = "unnamed";
|
|
|
|
issue_diag (I_STAGE, false, 0,
|
|
"processing %s order\n", section.name.c_str ());
|
|
|
|
// store the collation statements
|
|
while (!(next.token == Scanner::tok_order_end ||
|
|
next.token == Scanner::tok_order_start )) {
|
|
|
|
next = scanner_.next_token ();
|
|
switch (next.token) {
|
|
case Scanner::tok_end:
|
|
issue_diag (E_SYNTAX, true, &next,
|
|
"unexpected END directive while "
|
|
"parsing collate section\n");
|
|
case Scanner::tok_nl:
|
|
continue;
|
|
case Scanner::tok_comment:
|
|
scanner_.ignore_line ();
|
|
break;
|
|
case Scanner::tok_undefined:
|
|
case Scanner::tok_sym_name:
|
|
{
|
|
token_t sym (next);
|
|
section.entries.push_back (
|
|
std::make_pair(sym,token_list_t()));
|
|
collate_entry_t& entry = section.entries.back ();
|
|
|
|
next = scanner_.next_token ();
|
|
while (next.token != Scanner::tok_nl) {
|
|
entry.second.push_back (next);
|
|
next = scanner_.next_token ();
|
|
}
|
|
break;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// Handles the reorder statements in the form:
|
|
// reorder-after <sym>
|
|
// <sym> <sym><sym>...
|
|
// OR
|
|
// reorder-after <sym>
|
|
// .. <sym><sym>.....
|
|
// followed by reorder-end statement or another reorder-after
|
|
void Def::preprocess_reorder ()
|
|
{
|
|
while (true) {
|
|
// process one or more statements grouped
|
|
// under a reorder section
|
|
|
|
// retrieve the symbol
|
|
next = scanner_.next_token ();
|
|
if (next.token != Scanner::tok_sym_name)
|
|
issue_diag (E_SYNTAX, true, &next,
|
|
"expecting symbolic name, got %s\n",
|
|
next.name.c_str ());
|
|
|
|
// store the symbolic name
|
|
token_t sym (next);
|
|
collate_entry_list_t entries_list;
|
|
while (true) {
|
|
|
|
// process the statements
|
|
next = scanner_.next_token ();
|
|
|
|
if (next.token == Scanner::tok_nl )
|
|
continue;
|
|
else if ( next.token == Scanner::tok_sym_name
|
|
|| next.token == Scanner::tok_hex_ellipsis) {
|
|
// the line will contain one single symbolic name
|
|
// or a complete collation statement
|
|
collate_entry_t entry;
|
|
entry.first = next;
|
|
|
|
next = scanner_.next_token ();
|
|
while (next.token != Scanner::tok_nl) {
|
|
entry.second.push_back (next);
|
|
next = scanner_.next_token ();
|
|
}
|
|
|
|
// add the entry to the list
|
|
entries_list.push_back (entry);
|
|
} else if (next.token == Scanner::tok_reorder) {
|
|
// call insert_entry
|
|
if (!entries_list.empty ())
|
|
insert_entries (sym, entries_list);
|
|
entries_list.clear ();
|
|
break;
|
|
} else if (next.token == Scanner::tok_reorder_end ) {
|
|
// call insert_entry
|
|
// call insert_entry
|
|
if (!entries_list.empty ())
|
|
insert_entries (sym, entries_list);
|
|
return;
|
|
} else {
|
|
issue_diag (E_SYNTAX, true, &next,
|
|
"unexpected token : %s in a reorder block\n",
|
|
next.name.c_str ());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void Def::preprocess_reorder_section ()
|
|
{
|
|
while (next.token != Scanner::tok_reorder_section_end) {
|
|
// process one or more statements grouped
|
|
// under a reorder section
|
|
|
|
// retrieve the symbol
|
|
next = scanner_.next_token ();
|
|
token_t sym (next);
|
|
|
|
// process the statements
|
|
while (!(next.token == Scanner::tok_reorder_section ||
|
|
next.token == Scanner::tok_reorder_section_end)) {
|
|
|
|
next = scanner_.next_token ();
|
|
switch (next.token) {
|
|
case Scanner::tok_nl:
|
|
continue;
|
|
case Scanner::tok_sym_name: {
|
|
// the line will contain one single symbolic name
|
|
// which is the name of a section
|
|
next = scanner_.next_token ();
|
|
if (next.token != Scanner::tok_sym_name) {
|
|
issue_diag (E_SYNTAX, true, &next,
|
|
"expecting section name, got %s\n",
|
|
next.name.c_str ());
|
|
}
|
|
|
|
std::list<collate_section_t>::iterator beg =
|
|
section_list_.begin ();
|
|
std::list<collate_section_t>::iterator end =
|
|
section_list_.end ();
|
|
std::list<collate_section_t>::iterator ref_it, mov_it;
|
|
for (ref_it = beg;
|
|
ref_it->name != sym.name && ref_it != end;
|
|
++ref_it);
|
|
for (mov_it = beg;
|
|
mov_it->name != next.name && mov_it != end;
|
|
++mov_it);
|
|
|
|
if (ref_it == end || mov_it == end)
|
|
issue_diag (E_SYNTAX, true, &next,
|
|
"missing sections %s %s requested\n",
|
|
sym.name.c_str (), next.name.c_str ());
|
|
|
|
// replace it
|
|
section_list_.splice (++ref_it, section_list_,
|
|
mov_it, mov_it);
|
|
|
|
// replace the insertion point
|
|
sym = next;
|
|
break;
|
|
}
|
|
default:
|
|
issue_diag (E_SYNTAX, true, &next,
|
|
"unexpected token : %s in a "
|
|
"reorder section block\n",
|
|
next.name.c_str ());
|
|
break;
|
|
}
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void Def::list_collate ()
|
|
{
|
|
// print the script list
|
|
token_list_t::iterator script_it = script_list_.begin ();
|
|
while (script_it != script_list_.end ()) {
|
|
std::cout << "script " << (script_it++)->name << '\n';
|
|
}
|
|
std::cout << '\n';
|
|
|
|
// print the collating symbols' list
|
|
token_list_t::iterator cs_it = cs_list_.begin ();
|
|
while (cs_it != cs_list_.end ()) {
|
|
std::cout << "collating-symbol " << (cs_it++)->name << '\n';
|
|
}
|
|
std::cout << '\n';
|
|
|
|
// print the collating elements list
|
|
std::list<collate_elem_t>::iterator ce_it = ce_list_.begin ();
|
|
while (ce_it != ce_list_.end ()) {
|
|
std::cout << "collating-element " << ce_it->first.name
|
|
<< " from ";
|
|
token_list_t::iterator ce_def_it = ce_it->second.begin ();
|
|
for (;ce_def_it != ce_it->second.end (); ce_def_it++) {
|
|
std::cout << ce_def_it->name << ";";
|
|
}
|
|
std::cout << '\n';
|
|
}
|
|
std::cout << '\n';
|
|
|
|
// print out general/global symbols' list
|
|
token_list_t::iterator sym_it = sym_list_.begin ();
|
|
while (sym_it != sym_list_.end ()) {
|
|
std::cout << (sym_it++)->name << '\n';
|
|
}
|
|
std::cout << '\n';
|
|
|
|
// print out sections
|
|
std::list<collate_section_t>::iterator sc_it =
|
|
section_list_.begin ();
|
|
while (sc_it != section_list_.end ()) {
|
|
// print prolog
|
|
std::cout << "order_start ";
|
|
if (sc_it->name != "unnamed")
|
|
std::cout << sc_it->name << ";";
|
|
|
|
token_list_t::iterator it = sc_it->order.begin ();
|
|
while (it != sc_it->order.end ())
|
|
std::cout << (it++)->name << ";";
|
|
std::cout << '\n';
|
|
|
|
std::list<collate_entry_t>::iterator e_it =
|
|
sc_it->entries.begin ();
|
|
while (e_it != sc_it->entries.end ()) {
|
|
std::cout << e_it->first.name << " ";
|
|
token_list_t::iterator w_it = e_it->second.begin ();
|
|
while (w_it != e_it->second.end ())
|
|
std::cout << (w_it++)->name << ";";
|
|
std::cout << '\n';
|
|
++e_it;
|
|
}
|
|
|
|
++sc_it;
|
|
}
|
|
std::cout << '\n' << "order_end\n";
|
|
}
|
|
|
|
|
|
// FIXME - modify the algorithm to get log(N) complexity
|
|
// Inserts an entry inside the section information;
|
|
// searching for the appropriate entry in the list is
|
|
// done in linear time
|
|
bool Def::insert_entries (token_t& s, collate_entry_list_t& e)
|
|
{
|
|
// first remove these entries if found
|
|
collate_entry_list_t::iterator r_it = e.begin ();
|
|
for (; r_it != e.end (); ++r_it)
|
|
remove_entry (*r_it);
|
|
|
|
// first search through the symbols list; if found check
|
|
// the collate_entry object; it should not have weights
|
|
token_list_t::iterator it = sym_list_.begin ();
|
|
for (; it != sym_list_.end (); ++it) {
|
|
if (it->name != s.name)
|
|
continue;
|
|
|
|
// FIXME - all statements have to have no weights if they are
|
|
// to be inserted after collation symbols outside section boundaries;
|
|
// found it; check one collate_entry_t object
|
|
if (!e.begin ()->second.empty ()) {
|
|
issue_diag (E_REORD, true, &s,
|
|
"requested reorder-after: reference "
|
|
"symbol %s was found outside section boundaries "
|
|
"and the objects to be reordered are collation "
|
|
"definitions\n", s.name.c_str ());
|
|
}
|
|
|
|
// insert the symbolic name there
|
|
collate_entry_list_t::iterator e_it = e.begin ();
|
|
for (; e_it != e.end (); ++e_it)
|
|
it = sym_list_.insert (++it, e_it->first);
|
|
|
|
return true;
|
|
}
|
|
|
|
// if not successful then search in each section
|
|
std::list<collate_section_t>::iterator sect_it = section_list_.begin ();
|
|
while (sect_it != section_list_.end ()) {
|
|
collate_entry_list_t::iterator e_it = sect_it->entries.begin ();
|
|
for (; e_it != sect_it->entries.end (); ++e_it) {
|
|
if (e_it->first.name != s.name)
|
|
continue;
|
|
|
|
// found it; insert entry
|
|
sect_it->entries.insert (++e_it, e.begin (), e.end ());
|
|
return true;
|
|
}
|
|
++sect_it;
|
|
}
|
|
|
|
issue_diag (W_REORD, false, &s,
|
|
"requested reorder-after: reference "
|
|
"symbol %s was not found \n", s.name.c_str ());
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
// FIXME - modify the algorithm to get log(N) complexity
|
|
void Def::remove_entry (collate_entry_t& e)
|
|
{
|
|
// search in the sym_list_ and in the sections
|
|
if (e.second.empty ()) {
|
|
token_list_t::iterator it = sym_list_.begin ();
|
|
for (; it != sym_list_.end (); ++it) {
|
|
if (it->name != e.first.name)
|
|
continue;
|
|
|
|
sym_list_.erase (it);
|
|
return;
|
|
}
|
|
}
|
|
|
|
// search in the sections
|
|
std::list<collate_section_t>::iterator sect_it = section_list_.begin ();
|
|
while (sect_it != section_list_.end ()) {
|
|
std::list<collate_entry_t>::iterator e_it = sect_it->entries.begin ();
|
|
for (; e_it != sect_it->entries.end (); ++e_it) {
|
|
if (e_it->first.name != e.first.name)
|
|
continue;
|
|
|
|
sect_it->entries.erase (e_it);
|
|
return;
|
|
}
|
|
++sect_it;
|
|
}
|
|
return ;
|
|
}
|
|
|
|
|
|
|
|
// Converts the weight token `w` into collation weight values and stores
// them in the `weights` array, starting at index `weight_num`.
//
// Handled token kinds:
//   tok_sym_name      a single symbolic name, looked up in the charmap's
//                     w_cmap (and from there in coll_map_), then in
//                     ce_map_, then in cs_map_; one value is stored
//   tok_char_value    weight(s) given as escape sequences; fills the
//                     slots weight_num .. collate_out_.num_weights - 1
//   tok_ignore        IGNORE, stored as the single weight 0
//   tok_string        a quoted sequence of "<symbolic-name>" items and/or
//                     an escape sequence; also updates
//                     collate_out_.longest_weight
//   tok_abs_ellipsis  nothing is stored
// Any other token produces a W_SYNTAX warning and updates
// warnings_occurred_.
//
// Parameters:
//   w           token to convert (w.token selects the form, w.name holds
//               the text)
//   weights     array of weight slots written by this function
//   weight_num  index of the first (usually only) slot to write
//
// Returns true only when `w` is an absolute ellipsis, false otherwise.
//
// NOTE(review): the E_* diagnostics are presumably fatal (issue_diag is
// not expected to return for them) -- confirm. The lookups below are
// nevertheless guarded so that an end() iterator is never dereferenced
// and no loop spins forever should issue_diag return.
bool Def::get_weight (token_t&   w,
                      Weights_t* weights,
                      int        weight_num)
{
    const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();

    bool ret = false;

    // default length; overwritten by the branches that store more (or
    // fewer) than one value
    weights[weight_num].size = 1;

    if (w.token == Scanner::tok_sym_name) {

        ce_map_iter ce_map_it;

        w_cmap_iter w_cmap_pos = w_cmap.find (w.name);

        if (w_cmap_pos != w_cmap.end ()) {
            coll_map_iter coll_map_pos = coll_map_.find (w_cmap_pos->second);

            if (coll_map_pos == coll_map_.end ()) {
                // the character is in the charmap but has no entry in
                // the collation map; same diagnostic as the string form
                issue_diag (E_SYMUSED, true, &w,
                            "weight %s not defined\n",
                            w.name.c_str ());
            }
            else {
                if (coll_map_pos->second.coll_val == UINT_MAX)
                    issue_diag (E_SYMUSED, true,
                                &w, "symbolic name %s "
                                "used as weight before being assigned a "
                                "collation value\n", w.name.c_str ());

                // store weight
                weights[weight_num].weight[0] =
                    coll_map_pos->second.coll_val;
            }
        }
        else if ((ce_map_it = ce_map_.find (w.name)) != ce_map_.end ()) {
            weights[weight_num].weight[0] = ce_map_it->second.coll_val;
        }
        else {
            cs_map_iter cs_it = cs_map_.find (w.name);

            if (cs_it == cs_map_.end ())
                issue_diag (E_SYNTAX, true,
                            &w, "symbolic name %s not found\n",
                            w.name.c_str ());
            else
                weights[weight_num].weight[0] = cs_it->second;
        }
    }
    else if (w.token == Scanner::tok_char_value) {
        // the weight is given in numerical form
        const char* next_val =
            std::strchr (w.name.c_str (), scanner_.escape_char ());

        assert (0 != next_val);

        const char* next_wt = std::strchr (w.name.c_str (), ';');

        // NOTE(review): once next_val reaches a ';' it is never advanced
        // past it, so every slot after the first appears to end up with
        // size 0 -- confirm whether tokens containing ';' can reach here
        while (weight_num < collate_out_.num_weights) {

            std::size_t c;

            // convert consecutive escape sequences up to the next ';'
            // (or the end of the token when there is no ';')
            for (c = 0; *next_val && (!next_wt || next_val < next_wt); ++c) {

                const char* end = 0;

                weights [weight_num].weight [c] =
                    scanner_.convert_escape (next_val, &end, true);

                assert (0 != end);

                next_val = end;
            }

            weights [weight_num++].size = c;

            if (next_wt)
                next_wt = std::strchr (next_val, ';');
        }
    }
    else if (w.token == Scanner::tok_ignore) {
        // use the special weight 0 for IGNORE weights
        weights[weight_num].size      = 1;
        weights[weight_num].weight[0] = 0;
    }
    else if (w.token == Scanner::tok_string) {
        // the weights are given either in symbolic name form (e.g.,
        // "<symbolic-name>" or in the form of a quoted multibyte
        // character string (e.g., "\001\d010\x16\")

        // strip the first and last character (the enclosing quotes)
        const std::string tmp (w.name.substr (1, w.name.size () - 2));

        // keeps track of the length of the weight
        unsigned char k = 0;

        const char escape = scanner_.escape_char ();

        // iterate thru the string content and retrieve the symbols
        std::string::const_iterator it = tmp.begin ();

        while (it != tmp.end ()) {

            if (*it == '<') {
                // next comes a symbolic name; collect it, including the
                // angle brackets, dropping escape characters but keeping
                // the characters they escape
                std::string wsym;
                bool terminated = false;

                while (it != tmp.end ()) {
                    if (*it == '>') {
                        terminated = true;
                        break;
                    }

                    if (*it == escape && ++it == tmp.end ())
                        break;   // dangling escape at the end of input

                    wsym += *it++;
                }

                if (!terminated) {
                    // never read past the end of the string looking
                    // for the closing '>'
                    issue_diag (E_SYNTAX, true, &w,
                                "unterminated symbolic name %s "
                                "in weight string\n", wsym.c_str ());
                    break;
                }

                // append the closing '>'
                wsym += *it++;

                // wsym has the symbolic name, lookup for it in
                // the character map, collating-symbol map,
                // and collating-element map
                w_cmap_iter w_cm_pos = w_cmap.find (wsym);
                cs_map_iter cs_it    = cs_map_.find (wsym);
                ce_map_iter ce_it    = ce_map_.find (wsym);

                if (w_cm_pos != w_cmap.end ()) {
                    // is in the character map, check its associated
                    // collation value
                    coll_map_iter coll_it =
                        coll_map_.find (w_cm_pos->second);

                    if (coll_it == coll_map_.end ()) {
                        issue_diag (E_SYMUSED, true, &w,
                                    "weight %s not defined\n",
                                    wsym.c_str ());
                    }
                    else {
                        if (coll_it->second.coll_val == UINT_MAX)
                            issue_diag (E_SYMUSED, true,
                                        &w, "symbolic name %s "
                                        "used as weight before being assigned a "
                                        "collation value\n", wsym.c_str ());

                        weights[weight_num].weight[k++] =
                            coll_it->second.coll_val;
                    }
                }
                else if (ce_it != ce_map_.end ()) {
                    // it is in the collating-element map
                    weights[weight_num].weight[k++] = ce_it->second.coll_val;
                }
                else if (cs_it != cs_map_.end ()) {
                    // it is in the collating-symbol map
                    weights[weight_num].weight[k++] = cs_it->second;
                }
                else {
                    // it is not in any of the maps, that's an error
                    issue_diag (E_SYNTAX, true, &w,
                                "symbolic name %s not found\n",
                                wsym.c_str ());
                }
            }
            else if (*it == escape) {

                // weight is given in a quoted escape form
                const char* const beg = tmp.c_str () + (it - tmp.begin ());
                const char* end = 0;

                weights [weight_num].weight [k++] =
                    scanner_.convert_escape (beg, &end, true);

                assert (0 != end);

                it += end - beg;

                // NOTE(review): processing stops after the first
                // converted escape; presumably convert_escape consumes
                // the whole multibyte sequence -- confirm
                break;
            }
            else {
                issue_diag (E_SYNTAX, true, &w,
                            "illegal string content as a weight");

                // `it` cannot make progress on this character; leave the
                // loop rather than spin if the diagnostic returned
                break;
            }
        }

        weights[weight_num].size = k;

        if (k > collate_out_.longest_weight)
            collate_out_.longest_weight = k;
    }
    else if (w.token == Scanner::tok_abs_ellipsis) {
        // return true if ellipsis are embedded in the weight
        ret = true;
    }
    else {
        warnings_occurred_ =
            issue_diag (W_SYNTAX, false, &w,
                        "illegal token %s found in collation definition\n",
                        w.name.c_str ()) || warnings_occurred_;
    }

    return ret;
}
|
|
|
|
|
|
// if undefined optimization is on then only those characters that have
|
|
// been defined should go into the valid set, otherwise all characters go
|
|
// into the set.
|
|
void Def::gen_valid_coll_wchar_set () {
|
|
|
|
if (!valid_coll_wchar_set_.empty())
|
|
return;
|
|
|
|
create_wchar_utf8_table();
|
|
|
|
for (coll_map_iter coll_it = coll_map_.begin();
|
|
coll_it != coll_map_.end(); ++coll_it) {
|
|
if (coll_it->second.offset != UINT_MAX
|
|
|| !collate_out_.undefined_optimization){
|
|
|
|
std::string valid = utf8_encode (coll_it->first);
|
|
valid = valid.substr (0, valid.size() - 1);
|
|
|
|
while (valid.size() > 0){
|
|
valid_coll_wchar_set_.insert (valid);
|
|
valid = valid.substr(0, valid.size() - 1);
|
|
}
|
|
}
|
|
}
|
|
}
|