/***************************************************************************
 *
 * charmap.h
 *
 * $Id: charmap.h 580483 2007-09-28 20:55:52Z sebor $
 *
 ***************************************************************************
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed
 * with this work for additional information regarding copyright
 * ownership. The ASF licenses this file to you under the Apache
 * License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 * Copyright 2001-2007 Rogue Wave Software, Inc.
 *
 **************************************************************************/
|
|
|
|
|
|
#ifndef _RWSTD_CHARMAP_H_INCLUDED
|
|
#define _RWSTD_CHARMAP_H_INCLUDED
|
|
|
|
#include <string>
|
|
#include <list>
|
|
#include <map>
|
|
#include <set>
|
|
|
|
#ifndef _RWSTD_NO_ICONV
|
|
# include <iconv.h>
|
|
#endif // _RWSTD_NO_ICONV
|
|
|
|
|
|
#include "scanner.h"
|
|
|
|
class Charmap
|
|
{
|
|
public:
|
|
static const char* const portable_charset [];
|
|
|
|
Charmap(const char* /*corresponding C library locale*/,
|
|
const char* /*filename*/,
|
|
bool /*is utf8 encoding?*/,
|
|
bool /*create_forward_charmaps*/,
|
|
bool /*create_reverse_charmaps*/,
|
|
bool /*use UCS4 internally*/);
|
|
|
|
// returns the narrow character map which maps a symbolic character
|
|
// name to its narrow character value
|
|
const std::map<std::string, unsigned char>& get_n_cmap() const {
|
|
return n_cmap_;
|
|
}
|
|
|
|
// returns the reverse narrow character map which maps a narrow
|
|
// character value to its symbolic name
|
|
const std::map<unsigned char, std::string>& get_rn_cmap() const {
|
|
return rn_cmap_;
|
|
}
|
|
|
|
// returns the wide character map which maps a symbolic character
|
|
// name to its wide character value
|
|
const std::map<std::string, wchar_t>& get_w_cmap() const {
|
|
return w_cmap_;
|
|
}
|
|
|
|
// returns the reverse wide character map which maps a wide
|
|
// character value to its symbolic name
|
|
const std::map<wchar_t, std::string>& get_rw_cmap() const {
|
|
return rw_cmap_;
|
|
}
|
|
|
|
// returns the multibyte character map which maps a multibyte
|
|
// character to its corresponding wide character value
|
|
const std::map<std::string, wchar_t>& get_mb_cmap() const {
|
|
return mb_cmap_;
|
|
}
|
|
|
|
// returns the reverse multibyte character map which maps a wide
|
|
// character value to its corresponding multibyte character
|
|
const std::map<wchar_t, std::string>& get_rmb_cmap() const {
|
|
return rmb_cmap_;
|
|
}
|
|
|
|
// get the string value map
|
|
const std::list<std::string>& get_symnames_list() const {
|
|
return symnames_list_;
|
|
}
|
|
|
|
const std::map <std::string, wchar_t>& get_ucs4_cmap () const {
|
|
return ucs4_cmap_;
|
|
}
|
|
|
|
const std::map <wchar_t, std::string>& get_rucs4_cmap () const {
|
|
return rucs4_cmap_;
|
|
}
|
|
|
|
// return the value of mb_cur_max
|
|
int get_mb_cur_max() const {
|
|
return mb_cur_max_;
|
|
}
|
|
|
|
// return the name of the codeset
|
|
const std::string& get_code_set_name () const {
|
|
return code_set_name_;
|
|
}
|
|
|
|
// return the name of the character map
|
|
std::string get_charmap_name () const;
|
|
|
|
// return the full path to the charmap
|
|
std::string get_full_charmap_name () const {
|
|
return charmap_name_;
|
|
}
|
|
|
|
// convert the externally encoded string to the internal encoding
|
|
bool convert_to_wc (const std::string&, const std::string&, wchar_t&);
|
|
|
|
// convert the externally encoded string to UCS
|
|
bool convert_to_ucs (const std::string&, const std::string&, wchar_t&);
|
|
|
|
// convert the externally encoded string to UCS
|
|
wchar_t convert_sym_to_ucs (const std::string&) const;
|
|
|
|
|
|
// get the number of bytes in a single multi-byte character
|
|
std::size_t mbcharlen (const std::string&) const;
|
|
|
|
// convert the first byte in the multi-byte character to an unsigned char
|
|
unsigned char convert_char (const char*, const char** = 0) const;
|
|
|
|
unsigned char get_largest_nchar () const;
|
|
|
|
// increments the wide character value to the next encoded character
|
|
// in the current codeset; returns the incremented value or -1 on
|
|
// error
|
|
wchar_t increment_wchar (wchar_t) const;
|
|
|
|
private:
|
|
|
|
// processes characters implicitly defined by an ellipsis denoted
|
|
// by two explicitly defined characters; returns the number of
|
|
// characters in the range, -1 on error
|
|
std::size_t process_ellipsis (const Scanner::token_t&, int);
|
|
|
|
// process the charmap file making the necessary mappings in the cmaps
|
|
void process_chars();
|
|
|
|
// increment the encoded multi byte character argument
|
|
bool increment_encoding (std::string&);
|
|
|
|
// verify that all the characters in the portable character set
|
|
// are defined in the character map
|
|
void verify_portable_charset () const;
|
|
|
|
#ifndef _RWSTD_NO_ICONV
|
|
// open the iconv descriptor to convert to utf8
|
|
iconv_t open_iconv_to_utf8 () const;
|
|
#endif // _RWSTD_NO_ICONV
|
|
|
|
// convert a human-readable encoding of a character
|
|
// to its raw multibyte character representation
|
|
std::string encoding_to_mbchar (const std::string&) const;
|
|
|
|
// convert a multi-byte string to a utf8 multi-byte string
|
|
char* convert_to_utf8 (const char *inbuf, std::size_t inbuf_s,
|
|
char *outbuf, std::size_t outbuf_s) const;
|
|
|
|
#ifndef _RWSTD_NO_ICONV
|
|
# ifndef _RWSTD_NO_ISO_10646_WCHAR_T
|
|
|
|
// open the iconv descriptor to convert from utf8 to the external encoding
|
|
iconv_t open_iconv_to_ext ();
|
|
|
|
# endif // _RWSTD_NO_ISO_10646_WCHAR_T
|
|
#endif // _RWSTD_NO_ICONV
|
|
|
|
// add the symbolic name of a character and the raw multibyte
|
|
// character corresponding to it to the character maps
|
|
void add_to_cmaps (const std::string&,
|
|
const std::string&,
|
|
bool = false);
|
|
|
|
// the scanner used to process the charmap file
|
|
Scanner scanner_;
|
|
|
|
// the name of the codeset
|
|
std::string code_set_name_;
|
|
|
|
#if defined (_MSC_VER)
|
|
int codepage_;
|
|
#endif // _MSC_VER
|
|
|
|
// n_cmap maps the symbolic name to a narrow character value
|
|
// rn_cmap does the opposite
|
|
std::map <std::string, unsigned char> n_cmap_;
|
|
std::map <unsigned char, std::string> rn_cmap_;
|
|
|
|
// mb_cmap maps a multibyte character representation to its
|
|
// corresponding wide character value
|
|
// rmb_cmap does the opposite
|
|
std::map <std::string, wchar_t> mb_cmap_;
|
|
std::map <wchar_t, std::string> rmb_cmap_;
|
|
|
|
typedef std::map <wchar_t, std::string>::const_iterator rmb_cmap_iter;
|
|
typedef std::map <std::string, wchar_t>::const_iterator mb_cmap_iter;
|
|
|
|
// w_cmap maps the symbolic name to a wide character value
|
|
// rw_cmap does exactly the opposite
|
|
std::map <std::string, wchar_t> w_cmap_;
|
|
std::map <wchar_t, std::string> rw_cmap_;
|
|
|
|
// ucs4_cmap maps the symbolic name to the UCS4 value for that name
|
|
std::map <std::string, wchar_t> ucs4_cmap_;
|
|
std::map <wchar_t, std::string> rucs4_cmap_;
|
|
|
|
// the number of bytes in the largest multi-byte value
|
|
int mb_cur_max_;
|
|
|
|
#ifndef _RWSTD_NO_ICONV
|
|
// the iconv file descriptor that converts to utf8
|
|
iconv_t ic_to_utf8_;
|
|
|
|
// the iconv file descriptor that converts from utf8 to external
|
|
iconv_t ic_to_ext_;
|
|
#endif // _RWSTD_NO_ICONV
|
|
|
|
// the name of the character map file
|
|
std::string charmap_name_;
|
|
|
|
// the name of the C library locale with same encoding
|
|
std::string Clocale_;
|
|
|
|
unsigned char largest_nchar_;
|
|
|
|
// are we in the utf8 encoding?
|
|
bool in_utf8_;
|
|
|
|
// should we create the forward character maps
|
|
bool forward_maps;
|
|
|
|
// should we create the reverse character maps
|
|
bool reverse_maps;
|
|
|
|
// should we use UCS4 as the internal representation
|
|
bool UCS4_internal_;
|
|
|
|
// list of all known symbolic character names
|
|
std::list<std::string> symnames_list_;
|
|
|
|
Scanner::token_t next;
|
|
};
|
|
|
|
|
|
#endif // _RWSTD_CHARMAP_H_INCLUDED
|
|
|