CKG/extern/stdcxx/4.2.1/util/codecvt.cpp

/***************************************************************************
 *
 * codecvt.cpp
 *
 * $Id: codecvt.cpp 449092 2006-09-22 21:16:16Z sebor $
 *
 ***************************************************************************
 *
 * Licensed to the Apache Software  Foundation (ASF) under one or more
 * contributor  license agreements.  See  the NOTICE  file distributed
 * with  this  work  for  additional information  regarding  copyright
 * ownership.   The ASF  licenses this  file to  you under  the Apache
 * License, Version  2.0 (the  "License"); you may  not use  this file
 * except in  compliance with the License.   You may obtain  a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the  License is distributed on an  "AS IS" BASIS,
 * WITHOUT  WARRANTIES OR CONDITIONS  OF ANY  KIND, either  express or
 * implied.   See  the License  for  the  specific language  governing
 * permissions and limitations under the License.
 *
 * Copyright 2001-2006 Rogue Wave Software.
 *
 **************************************************************************/

#include "diagnostic.h"   // for issue_diag()
#include "def.h"          // for Def

#include "path.h"         // for get_pathname()
#include "scanner.h"      // for scanner

#include <cassert>        // for assert()
#include <climits>        // for UCHAR_MAX
#include <cstring>        // for memset()
#include <fstream>        // for ifstream, ofstream


// const iterator over a map from narrow (multibyte) strings to wchar_t;
// used in this file to walk the map returned by charmap_.get_mb_cmap ()
typedef std::map<std::string, wchar_t>::const_iterator n_cmap_citer2;


// Recursively builds the lookup tables that map the byte sequences of
// the multibyte characters in the charmap to their ordinal numbers.
//
// Every table has one slot per byte value (0 through UCHAR_MAX). For
// the sequence obtained by appending the slot's byte to charp the slot
// is set to:
//   - the ordinal number of the character (its position in the
//     iteration order of the multibyte character map) when the
//     sequence is a complete multibyte character,
//   - the number of a continuation table with the most significant
//     bit (0x80000000) set when the sequence is only a proper prefix
//     of a longer multibyte character, or
//   - UINT_MAX when the sequence is neither.
//
// tab      receives (table number, table) pairs; the tables are
//          allocated with new and are not freed by this function
// off_map  filled in by the topmost call with the ordinal number of
//          every multibyte character, then used for lookups
// charp    the prefix the table being generated corresponds to
// tabno    the number of the table being generated; must be 0 for
//          (and only for) the topmost, non-recursive call
//
// Returns the number of complete characters entered into this table
// and all tables generated recursively from it.
std::size_t Def::
gen_mbchar_tables (codecvt_offsets_map_t           &tab,
                   std::map<std::string, unsigned> &off_map,
                   const std::string               &charp /* = "" */,
                   unsigned                         tabno /* = 0 */)
{
    // state shared between the topmost call and its recursive calls:
    // the number of the most recently created continuation table and
    // the set of proper prefixes of all known multibyte characters
    static unsigned               ntabs   = 0;
    static std::set<std::string>* pfx_set = 0;

    const n_cmap_citer2 mb_map_end = charmap_.get_mb_cmap ().end ();

    if (0 == pfx_set) {
        // first (topmost) call: generate the prefix set
        pfx_set = new std::set<std::string>;

        // iterate over the range of valid multibyte characters
        // obtained from the charmap and generate a complete
        // subset of non-empty multibyte prefixes from each
        unsigned off = 0;

        const n_cmap_citer2 mb_map_begin = charmap_.get_mb_cmap ().begin ();

        for (n_cmap_citer2 it = mb_map_begin; it != mb_map_end; ++it, ++off) {

            // insert the ordinal number of each multibyte character
            // into a map for fast lookup later
            off_map.insert (std::make_pair (it->first, off));

            // generate non-empty prefixes up to one byte less
            // in length than the complete multibyte character
            for (std::string prefix = it->first; 1 < prefix.size (); ) {
                prefix = prefix.substr (0, prefix.size () - 1);
                pfx_set->insert (prefix);
            }
        }
    }

    // number of valid characters inserted into the tables
    std::size_t nchars = 0;

    // an array of offsets to the multibyte character or to the next
    // array containing such offsets (defined recursively for up to
    // MB_CUR_MAX levels of nesting)
    codecvt_offset_tab_t* const offsets = new codecvt_offset_tab_t;

    // the prefix followed by a placeholder for the byte being tried
    std::string mb_char (charp + '\0');

    for (unsigned i = 0; i <= UCHAR_MAX; ++i) {

        unsigned char cur_char = (unsigned char)i;

        mb_char [mb_char.size () - 1] = char (cur_char);

        if (mb_map_end == charmap_.get_mb_cmap ().find (mb_char)) {
            // mb_char is not a complete, valid multibyte character
            // check to see if it's a prefix of one

            if (pfx_set->find (mb_char) == pfx_set->end ()) {
                // mb_char is not a prefix of a valid multibyte
                // character, mark it invalid
                offsets->off [cur_char] = UINT_MAX;
            }
            else {
                // mb_char is a prefix of a valid multibyte character,
                // set the MSB to denote that it "continues" in the
                // table at the next higher offset
                offsets->off [cur_char] = ++ntabs | 0x80000000;

                // generate that table
                nchars += gen_mbchar_tables (tab, off_map, mb_char, ntabs);
            }
        }
        else {
            // mb_char is a complete, valid multibyte character
            // insert its ordinal number (offset) into the array
            offsets->off [cur_char] = off_map.find (mb_char)->second;
            ++nchars;
        }
    }

    // insert the completely populated table into the map
    tab.insert (std::make_pair (tabno, offsets));

    if (0 == tabno) {
        // clean up on return from the topmost (non-recursive) call;
        // the test must be on tabno rather than on ntabs since ntabs
        // is non-zero as soon as a single continuation table exists
        delete pfx_set;
        pfx_set = 0;

        // start numbering from scratch should the function be
        // invoked again
        ntabs = 0;
    }

    return nchars;
}


// Recursively builds the lookup tables that map the UTF-8 encoding of
// each wide character value in the multibyte character map to the
// offset of the corresponding multibyte string.
//
// The offset of a multibyte string is the sum of (length + 1) of all
// the strings that precede it in the iteration order of the multibyte
// character map, i.e., its position in a block of NUL-terminated
// strings laid out in that order.
//
// Every table has one slot per byte value (0 through UCHAR_MAX). For
// the sequence obtained by appending the slot's byte to charp the slot
// is set to:
//   - the offset of the multibyte string when the sequence is the
//     complete UTF-8 encoding of a character,
//   - the number of a continuation table with the most significant
//     bit (0x80000000) set when the sequence is only a proper prefix
//     of such an encoding, or
//   - UINT_MAX when the sequence is neither.
//
// tab      receives (table number, table) pairs; the tables are
//          allocated with new and are not freed by this function
// charp    the UTF-8 prefix the table being generated corresponds to
// tabno    the number of the table being generated; must be 0 for
//          (and only for) the topmost, non-recursive call
//
// Returns the number of complete characters entered into this table
// and all tables generated recursively from it.
std::size_t Def::
gen_wchar_tables (codecvt_offsets_map_t &tab,
                  const std::string     &charp /* = "" */,
                  unsigned int           tabno /* = 0 */)
{
    // state shared between the topmost call and its recursive calls:
    //   ntabs    number of the most recently created continuation table
    //   pfx_set  proper prefixes of all UTF-8 encodings
    //   off_map  multibyte string -> offset of the string
    //   utf_map  UTF-8 encoding   -> multibyte string
    static unsigned                            ntabs   = 0;
    static std::set<std::string>              *pfx_set = 0;
    static std::map<std::string, unsigned>    *off_map = 0;
    static std::map<std::string, std::string> *utf_map = 0;

    if (0 == utf_map) {
        // first (topmost) call: populate the shared state
        pfx_set = new std::set<std::string>;
        off_map = new std::map<std::string, unsigned>;
        utf_map = new std::map<std::string, std::string>;

        const n_cmap_citer2 first = charmap_.get_mb_cmap ().begin ();
        const n_cmap_citer2 last  = charmap_.get_mb_cmap ().end ();

        unsigned off = 0;

        for (n_cmap_citer2 it = first; it != last; ++it) {

            off_map->insert (std::make_pair (it->first, off));

            // account for the string and its terminating NUL
            off += it->first.size () + 1;

            std::string utf = utf8_encode (it->second);

            utf_map->insert (std::make_pair (utf, it->first));

            // generate non-empty prefixes up to one byte less
            // in length than the complete UTF-8 encoding
            while (1 < utf.size ()) {
                utf = utf.substr (0, utf.size () - 1);
                pfx_set->insert (utf);
            }
        }
    }

    codecvt_offset_tab_t* const offsets = new codecvt_offset_tab_t;

    // number of valid characters inserted into the tables
    std::size_t nchars = 0;

    // the prefix followed by a placeholder for the byte being tried
    std::string mb_char (charp + '\0');

    for (unsigned i = 0; i <= UCHAR_MAX; ++i) {

        unsigned char cur_char = (unsigned char)i;

        mb_char [mb_char.size () - 1] = char (cur_char);

        const wchar_utf8_iter it = utf_map->find (mb_char);
        if (it == utf_map->end ()) {
            if (pfx_set->find (mb_char) == pfx_set->end ()) {
                // neither a complete encoding nor a prefix of one
                offsets->off [cur_char] = UINT_MAX;
            }
            else {
                // a prefix: set the MSB to denote that the sequence
                // continues in the table with the given number
                offsets->off [cur_char] = ++ntabs | 0x80000000;

                nchars += gen_wchar_tables (tab, mb_char, ntabs);
            }
        }
        else {
            // a complete encoding: store the offset of the
            // corresponding multibyte string
            offsets->off [cur_char] = off_map->find (it->second)->second;

            ++nchars;
        }
    }

    // insert the completely populated table into the map
    tab.insert (std::make_pair (tabno, offsets));

    if (0 == tabno) {
        // clean up on return from the topmost (non-recursive) call;
        // the test must be on tabno rather than on ntabs since ntabs
        // is non-zero as soon as a single continuation table exists
        delete pfx_set;
        delete off_map;
        delete utf_map;

        pfx_set = 0;
        off_map = 0;
        utf_map = 0;

        // start numbering from scratch should the function be
        // invoked again
        ntabs = 0;
    }

    return nchars;
}


// Recursively builds the lookup tables that map the UTF-8 encoding of
// each value in the UCS-4 character map to the value that off_map
// associates with the character's external (multibyte) encoding.
//
// Every table has one slot per byte value (0 through UCHAR_MAX). For
// the sequence obtained by appending the slot's byte to charp the slot
// is set to:
//   - the off_map value for the character when the sequence is the
//     complete UTF-8 encoding of a character,
//   - the number of a continuation table with the most significant
//     bit (0x80000000) set when the sequence is only a proper prefix
//     of such an encoding, or
//   - UINT_MAX when the sequence is neither.
//
// tab      receives (table number, table) pairs; the tables are
//          allocated with new and are not freed by this function
// off_map  maps multibyte strings to the values stored in the tables;
//          only read, never modified, by this function
// charp    the UTF-8 prefix the table being generated corresponds to
// tabno    the number of the table being generated; must be 0 for
//          (and only for) the topmost, non-recursive call
//
// Returns the number of complete characters entered into this table
// and all tables generated recursively from it.
std::size_t Def::
gen_utf8_tables (codecvt_offsets_map_t           &tab,
                 std::map<std::string, unsigned> &off_map,
                 const std::string               &charp /* = "" */,
                 unsigned                         tabno /* = 0 */)
{
    // state shared between the topmost call and its recursive calls:
    //   ntabs    number of the most recently created continuation table
    //   pfx_set  proper prefixes of all UTF-8 encodings
    //   utf_map  UTF-8 encoding -> UCS-4 value
    static unsigned                        ntabs   = 0;
    static std::set<std::string>          *pfx_set = 0;
    static std::map<std::string, wchar_t> *utf_map = 0;

    if (0 == pfx_set) {
        pfx_set = new std::set<std::string>;

        const ucs4_cmap_iter first = charmap_.get_ucs4_cmap ().begin ();
        const ucs4_cmap_iter last  = charmap_.get_ucs4_cmap ().end ();

        for (ucs4_cmap_iter it = first; it != last; ++it) {

            // generate non-empty prefixes up to one byte less
            // in length than the complete UTF-8 encoding
            for (std::string prefix = utf8_encode (it->second);
                 1 < prefix.size (); ) {
                prefix = prefix.substr (0, prefix.size () - 1);
                pfx_set->insert (prefix);
            }
        }
    }

    // the set of complete utf8 strings in the current character map
    typedef std::map<std::string, wchar_t>::iterator utf8_map_iter;

    if (0 == utf_map) {
        utf_map = new std::map<std::string, wchar_t>;

        const ucs4_cmap_iter first = charmap_.get_ucs4_cmap ().begin ();
        const ucs4_cmap_iter last  = charmap_.get_ucs4_cmap ().end ();

        for (ucs4_cmap_iter it = first; it != last; ++it) {
            const std::string utf = utf8_encode (it->second);
            utf_map->insert (std::make_pair (utf, it->second));
        }
    }

    codecvt_offset_tab_t* const offsets = new codecvt_offset_tab_t;

    // number of valid characters inserted into the tables
    std::size_t nchars = 0;

    // the prefix followed by a placeholder for the byte being tried
    std::string mb_char = charp + '\0';

    for (unsigned int i = 0; i <= UCHAR_MAX; ++i) {

        unsigned char cur_char = (unsigned char)i;

        mb_char [mb_char.size () - 1] = char (cur_char);

        const utf8_map_iter where = utf_map->find (mb_char);

        if (where == utf_map->end ()) {
            if (pfx_set->find (mb_char) == pfx_set->end ()) {
                // neither a complete encoding nor a prefix of one
                offsets->off [cur_char] = UINT_MAX;
            }
            else {
                // a prefix: set the MSB to denote that the sequence
                // continues in the table with the given number
                offsets->off [cur_char] = ++ntabs | 0x80000000;
                nchars += gen_utf8_tables (tab, off_map, mb_char, ntabs);
            }
        }
        else {
            // first get the symbolic name
            std::string str
                = charmap_.get_rucs4_cmap ().find (where->second)->second;

            // then get the internal encoding of the character
            const wchar_t int_enc = charmap_.get_w_cmap().find (str)->second;

            // then get the external encoding to use in a lookup in
            // off_map
            str = charmap_.get_rmb_cmap ().find (int_enc)->second;

            offsets->off [cur_char] = off_map.find (str)->second;

            ++nchars;
        }
    }

    // insert the completely populated table into the map
    tab.insert (std::make_pair (tabno, offsets));

    if (0 == tabno) {
        // clean up on return from the topmost (non-recursive) call;
        // the test must be on tabno rather than on ntabs since ntabs
        // is non-zero as soon as a single continuation table exists
        delete pfx_set;
        delete utf_map;

        pfx_set = 0;
        utf_map = 0;

        // start numbering from scratch should the function be
        // invoked again
        ntabs = 0;
    }
    return nchars;
}


void Def::
gen_xlit_data ()
{
    // data offset points to the beginning of the data containing
    // the narrow strings character encodings
    unsigned int data_offset = 0;

    // traverse the map and construct the map of offsets
    xlit_map_t::const_iterator it = xlit_map_.begin ();
    for (; it != xlit_map_.end (); ++it) {
        // insert pair(wchar_t value, offset of first string in data block)
        xlit_data_offset_map_.insert (
            std::make_pair (it->first,data_offset));

        // advance the data_offset value to the next "first" string
        std::list<std::string>::const_iterator sit =
            it->second.begin ();
        for (; sit != it->second.end (); ++sit) {
            data_offset += sit->size () + 1;
        }
        ++data_offset;
    }

    // create a new table (first), populate it with default values
    // and insert it in the tables map
    xlit_offset_table_t table0;
    unsigned int k;
    for (k = 0; k < UCHAR_MAX + 1; ++k)
        table0.offset_table [k] = UINT_MAX;

    // insert it into the map
    xlit_table_map_.insert (std::make_pair(0, table0));

    const xlit_map_t::const_iterator xlit_map_end = xlit_map_.end ();

    // traverse the map again and build the tables
    for (it = xlit_map_.begin (); it != xlit_map_end; ++it) {

        // encode the wchar_t value to UTF-8
        const std::string utf8_rep (utf8_encode (it->first));
        data_offset = xlit_data_offset_map_.find (it->first)->second;

        // traverse the utf8 representation string and create the
        // necessary tables and populate the indexes
        unsigned int table_idx = 0;

        const std::string::const_iterator utf8_rep_end = utf8_rep.end ();
        std::string::const_iterator       string_it    = utf8_rep.begin ();

        for (; string_it != utf8_rep_end; ++string_it) {
            // get the table corresponding to the current index and locate
            // the value at that index
            const xlit_table_map_t::iterator res =
                xlit_table_map_.find (table_idx);

            assert (res != xlit_table_map_.end ());

            // offset in table
            unsigned char off_idx = (unsigned char)*string_it;

            // res is the iterator pointing to the correct table in the map
            // check the index and if not populated, create a new table
            if (res->second.offset_table [off_idx] == UINT_MAX) {

                // if this is the last position in the string, then
                // fill the table position with the offset of the string data
                if ((string_it + 1) == utf8_rep.end ()) {
                    xlit_data_offset_map_t::const_iterator data_it =
                        xlit_data_offset_map_.find (it->first);
                    assert (data_it != xlit_data_offset_map_.end ());

                    // fill the table position with the found offset
                    res->second.offset_table [off_idx] = data_it->second;
                    continue;
                }

                // create a new table and append it to the map
                xlit_offset_table_t table;
                for (unsigned int i = 0; i < UCHAR_MAX + 1; ++i)
                    table.offset_table [i] = UINT_MAX;

                // insert it into the map
                unsigned int tmp = xlit_table_map_.size ();
                xlit_table_map_.insert (std::make_pair(tmp, table));

                // store its index at correct position in current table
                res->second.offset_table [off_idx] = tmp | 0x80000000;
                table_idx = tmp;
            } else {
                table_idx =
                    res->second.offset_table [off_idx] & 0x7FFFFFFF;
            }
        }
    }
}


// Generates the conversion and transliteration tables for the current
// charmap and writes them in binary form to the codecvt database file
// named after the code set, located in the parent of dir_name.
//
// Does nothing when codecvt_written_ is set or when a file with the
// computed name can already be opened for reading.
//
// In the order in which they are written, the file consists of:
//   1. the _RW::__rw_codecvt_t header structure
//   2. the multibyte tables
//   3. the wchar_t tables
//   4. the UTF-8 tables
//   5. the transliteration lookup tables
//   6. one pair of wchar_t values per multibyte character
//   7. the NUL-terminated code set name and charmap name
//   8. the NUL-terminated narrow (multibyte) character strings
//   9. the transliteration data
//
// The output stream has exceptions enabled for failbit and badbit so
// a failure to open or write the file is reported by an exception.
//
// NOTE(review): codecvt_written_ is tested but not set anywhere in
// this function -- presumably it is set elsewhere; confirm.
void Def::
write_codecvt (std::string dir_name)
{
    // if it has been already written
    if (codecvt_written_)
        return;

    // compose the name of the file: <dir_name>/../<code set name>
    ((dir_name += _RWSTD_PATH_SEP) += "..") += _RWSTD_PATH_SEP;
    dir_name += charmap_.get_code_set_name ();

    // check to see if the codecvt database already exists and
    // avoid recreating it if it does (as an optimization)
    if (std::ifstream (dir_name.c_str ())) {
        issue_diag (I_OPENWR, false, 0,
                    "%s exists, skipping\n", dir_name.c_str ());
        return;
    }

    //////////////////////////////////////////////////////////////////
    // generate multibyte conversion tables
    issue_diag (I_STAGE, false, 0, "generating multibyte tables\n");

    // off_map is filled in by gen_mbchar_tables() and reused by
    // gen_utf8_tables() below
    codecvt_offsets_map_t           mbchar_offs;
    std::map<std::string, unsigned> off_map;
    const std::size_t n_mbchars = gen_mbchar_tables (mbchar_offs, off_map);

    // generate wchar_t conversion tables
    issue_diag (I_STAGE, false, 0, "generating wchar_t tables\n");

    codecvt_offsets_map_t wchar_offs;
    const std::size_t n_wchars = gen_wchar_tables (wchar_offs);

    // generate UTF-8 conversion tables
    issue_diag (I_STAGE, false, 0, "generating UTF-8 tables\n");

    codecvt_offsets_map_t uchar_offs;
    const std::size_t n_uchars = gen_utf8_tables (uchar_offs, off_map);

    // not needed beyond this point, clear it out
    off_map.clear ();

    // generate the transliteration tables and the transliteration data
    issue_diag (I_STAGE, false, 0, "generating transliteration tables\n");
    gen_xlit_data ();

    //////////////////////////////////////////////////////////////////
    // populate the codecvt structure before writing it out
    // in binary form to the file (the codecvt database)
    _RW::__rw_codecvt_t codecvt_out;
    std::memset (&codecvt_out, 0, sizeof codecvt_out);

    // calculate the byte offset of each section; the first section
    // (written immediately after the header) is at offset 0 and each
    // subsequent offset is the previous one plus the size of the
    // previous section; every table takes up (UCHAR_MAX + 1) unsigned
    // values
    codecvt_out.n_to_w_tab_off = 0;
    codecvt_out.w_to_n_tab_off = codecvt_out.n_to_w_tab_off
        + mbchar_offs.size () * (UCHAR_MAX + 1) * sizeof (unsigned);

    codecvt_out.utf8_to_ext_tab_off = codecvt_out.w_to_n_tab_off
        + wchar_offs.size () * (UCHAR_MAX + 1) * sizeof (unsigned);

    // insert the transliteration tables here
    codecvt_out.xliteration_off = codecvt_out.utf8_to_ext_tab_off
        + uchar_offs.size () * (UCHAR_MAX + 1) * sizeof (unsigned);

    codecvt_out.wchar_off = codecvt_out.xliteration_off
        + xlit_table_map_.size () * (UCHAR_MAX + 1) * sizeof (unsigned);

    // two wchar_t values are written for every multibyte character
    codecvt_out.codeset_off = codecvt_out.wchar_off
        + charmap_.get_mb_cmap ().size () * 2 * sizeof (wchar_t);

    codecvt_out.charmap_off = codecvt_out.codeset_off
        + charmap_.get_code_set_name ().size () + 1 /* NUL */;

    // offset of the first of the narrow character strings
    const std::size_t mb_offset = codecvt_out.charmap_off
        + charmap_.get_charmap_name ().size () + 1 /* NUL */;

    // compute the size of narrow strings map which added to
    // mb_offset will give the start of the transliteration data
    std::size_t xlit_data_offset = mb_offset;

    mb_cmap_iter iter;

    for (iter = charmap_.get_mb_cmap ().begin();
         iter != charmap_.get_mb_cmap().end(); ++iter) {
        xlit_data_offset += iter->first.size() + 1;
    }

    // now traverse again the utf8 tables for transliteration data
    // and recompute the offsets:
    const xlit_table_map_t::const_iterator xlit_table_map_end =
        xlit_table_map_.end ();

    xlit_table_map_t::iterator xit = xlit_table_map_.begin ();
    for (; xit != xlit_table_map_end; ++xit) {
        for (unsigned int i = 0; i < UCHAR_MAX + 1; ++i) {
            // skip entries with the MSB set, i.e., references to
            // other tables as well as unused (UINT_MAX) entries
            if (xit->second.offset_table [i] & 0x80000000)
                continue;
            // add the offset for xliteration data
            xit->second.offset_table [i] += xlit_data_offset;
        }
    }

    codecvt_out.mb_cur_max = charmap_.get_mb_cur_max();

    issue_diag (I_OPENWR, false, 0, "writing %s\n", dir_name.c_str ());

    // create the stream and enable exceptions on it; if the file
    // could not be opened the call to exceptions() itself throws
    std::ofstream out (dir_name.c_str(), std::ios::binary);
    out.exceptions (std::ios::failbit | std::ios::badbit);

    // write the codecvt_out structure
    out.write ((char*)&codecvt_out, sizeof codecvt_out);

    typedef codecvt_offsets_map_t::iterator off_iter_t;

    //////////////////////////////////////////////////////////////////
    // write out the multibyte to wchar_t tables
    issue_diag (I_WRITE, false, 0,
                "writing %lu multibyte tables (%lu characters)\n",
                mbchar_offs.size (), n_mbchars);

    for (off_iter_t it = mbchar_offs.begin (); it != mbchar_offs.end (); ++it) {
        for (unsigned i = 0; i <= UCHAR_MAX; ++i) {

            // the entries of these tables are written out unadjusted
            const unsigned off = it->second->off [i];

            out.write ((const char*)&off, sizeof off);
        }

        // the table was allocated by gen_mbchar_tables()
        delete it->second;
    }

    // not needed beyond this point, clear it out
    mbchar_offs.clear ();

    //////////////////////////////////////////////////////////////////
    // write out the wchar_t to multibyte conversion tables
    issue_diag (I_WRITE, false, 0,
                "writing %lu wchar_t tables (%lu characters)\n",
                wchar_offs.size (), n_wchars);

    for (off_iter_t it = wchar_offs.begin (); it != wchar_offs.end (); ++it) {
        for (unsigned i = 0; i <= UCHAR_MAX; ++i) {

            // adjust offsets to multibyte characters (but not those
            // to other tables or invalid encodings)
            unsigned off = it->second->off [i];

            if (!(off & 0x80000000))
                off += mb_offset;

            out.write ((const char*)&off, sizeof off);
        }

        // the table was allocated by gen_wchar_tables()
        delete it->second;
    }

    // not needed beyond this point, clear it out
    wchar_offs.clear ();

    //////////////////////////////////////////////////////////////////
    // write out the UTF-8 to (libc) multibyte tables
    issue_diag (I_WRITE, false, 0,
                "writing %lu UTF-8 tables (%lu characters)\n",
                uchar_offs.size (), n_uchars);

    for (off_iter_t it = uchar_offs.begin (); it != uchar_offs.end (); ++it) {
        for (unsigned i = 0; i <= UCHAR_MAX; ++i) {

            // adjust offsets to multibyte characters (but not those
            // to other tables or invalid encodings)
            // NOTE(review): the entries come from off_map, which
            // gen_mbchar_tables() fills with ordinal numbers rather
            // than byte offsets -- confirm that adding mb_offset to
            // them is what the reader of the database expects
            unsigned off = it->second->off [i];

            if (!(off & 0x80000000))
                off += mb_offset;

            out.write ((const char*)&off, sizeof off);
        }

        // the table was allocated by gen_utf8_tables()
        delete it->second;
    }

    // not needed beyond this point, clear it out
    uchar_offs.clear ();

    //////////////////////////////////////////////////////////////////
    // write out the transliteration UTF-8 lookup tables
    issue_diag (I_WRITE, false, 0,
                "writing transliteration table (size %lu)\n",
                xlit_table_map_.size ());

    xit = xlit_table_map_.begin ();
    for (; xit != xlit_table_map_end; ++xit) {
        // write the table out one entry at a time
        const unsigned int* ptable = &xit->second.offset_table [0];
        for (unsigned int i = 0; i < UCHAR_MAX + 1; ++i, ++ptable)
            out.write ((const char*)ptable, sizeof (unsigned int));
    }

    issue_diag (I_WRITE, false, 0,
                "writing the UCS table (%lu characters)\n",
                charmap_.get_mb_cmap ().size ());

    const mb_cmap_iter n_cmap2_end = charmap_.get_mb_cmap ().end ();

    // write the locale-encoded wchar_t and the UCS4 wchar_t; the
    // latter is obtained by looking up the value that the reverse
    // wide character map associates with the locale-encoded wchar_t
    // in the UCS-4 character map
    for (iter = charmap_.get_mb_cmap ().begin();
         iter != n_cmap2_end; ++iter) {
        out.write ((const char*)&iter->second, sizeof (iter->second));
        out.write ((const char*)& (charmap_.get_ucs4_cmap().find
                                   (charmap_.get_rw_cmap().find
                                    (iter->second)->second))->second,
                   sizeof (wchar_t));
    }

    // write the code_set_name string and charmap string,
    // each followed by a terminating NUL
    out << charmap_.get_code_set_name() << std::ends
        << charmap_.get_charmap_name() << std::ends;


    // write out the narrow character strings
    // (each including its terminating NUL)
    for (iter = charmap_.get_mb_cmap().begin();
         iter != n_cmap2_end; ++iter) {
        out.write (iter->first.c_str(), iter->first.size() + 1);
    }

    issue_diag (I_WRITE, false, 0,
                "writing transliteration data (size %lu)\n",
                xlit_map_.size ());

    // write out the transliteration data: for every character its
    // NUL-terminated strings followed by one additional NUL byte
    xlit_map_t::const_iterator xlit_data_it = xlit_map_.begin ();
    for (; xlit_data_it != xlit_map_.end (); ++xlit_data_it) {
        std::list<std::string>::const_iterator sit =
            xlit_data_it->second.begin ();
        for (; sit != xlit_data_it->second.end (); ++sit) {
            out.write (sit->c_str (), sit->size () + 1);
        }
        out.write ("\0", 1);
    }
}