352 lines
12 KiB
C++
352 lines
12 KiB
C++
/***************************************************************************
|
|
*
|
|
* iconv.cpp - Win32 implementation of the POSIX iconv facility
|
|
*
|
|
* $Id: iconv.cpp 550991 2007-06-26 23:58:07Z sebor $
|
|
*
|
|
***************************************************************************
|
|
*
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
* contributor license agreements. See the NOTICE file distributed
|
|
* with this work for additional information regarding copyright
|
|
* ownership. The ASF licenses this file to you under the Apache
|
|
* License, Version 2.0 (the "License"); you may not use this file
|
|
* except in compliance with the License. You may obtain a copy of
|
|
* the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
* implied. See the License for the specific language governing
|
|
* permissions and limitations under the License.
|
|
*
|
|
* Copyright 1994-2006 Rogue Wave Software.
|
|
*
|
|
**************************************************************************/
|
|
|
|
#if defined (_WIN32) || defined (_WIN64)
|
|
|
|
#include <errno.h>
#include <string.h>    // strcmp()

#include <windows.h>
|
|
|
|
|
|
// Conversion descriptor. The implementation in this file packs the
// source code page into the upper 16 bits and the destination code
// page into the lower 16 bits; -1 denotes an invalid descriptor.
typedef int iconv_t;

// obtain a descriptor for converting text from `from_code' to `to_code'
iconv_t iconv_open (const char *from_code, const char *to_code);

// convert the bytes at *inbuf into *outbuf, advancing both buffers
size_t iconv (iconv_t cd,
              char **inbuf, size_t *inbytesleft,
              char **outbuf, size_t *outbytesleft);

// dispose of a descriptor obtained from iconv_open()
int iconv_close (iconv_t cd);
|
|
|
|
|
|
// Creates a conversion descriptor for converting text encoded in
// `from_code' to text encoded in `to_code'.
//
// Both names are looked up (case-sensitively, exact match) in a table
// of Windows code pages. The returned descriptor holds the source code
// page in its upper 16 bits and the destination code page in its lower
// 16 bits.
//
// Returns the descriptor on success. On failure returns -1 and sets
// errno to EINVAL: either name is null or not in the table, or one of
// the code pages is rejected by MultiByteToWideChar() on this system.
iconv_t iconv_open (const char *from_code, const char *to_code)
{
    static const struct {
        int         code;
        const char *name;
    } pages[] = {
        { 37, "EBCDIC-US" },
        { 437, "OEM - United States" },
        { 500, "IBM EBCDIC - International" },
        { 708, "Arabic - ASMO 708" },
        { 709, "Arabic - ASMO 449+, BCON V4" },
        { 710, "Arabic - Transparent Arabic" },
        { 720, "Arabic - Transparent ASMO" },
        { 737, "OEM - Greek (formerly 437G)" },
        { 775, "OEM - Baltic" },
        { 850, "OEM - Multilingual Latin I" },
        { 852, "OEM - Latin II" },
        { 855, "OEM - Cyrillic (primarily Russian)" },
        { 857, "OEM - Turkish" },
        { 858, "OEM - Multlingual Latin I + Euro symbol" },
        { 860, "OEM - Portuguese" },
        { 861, "OEM - Icelandic" },
        { 862, "OEM - Hebrew" },
        { 863, "OEM - Canadian-French" },
        { 864, "OEM - Arabic" },
        { 865, "OEM - Nordic" },
        { 866, "OEM - Russian" },
        { 869, "OEM - Modern Greek" },
        { 870, "IBM EBCDIC - Multilingual/ROECE (Latin-2)" },
        { 874, "ANSI/OEM - Thai (same as 28605, ISO 8859-15)" },
        { 875, "IBM EBCDIC - Modern Greek" },
        { 932, "Shift_JIS" },
        { 936, "ANSI/OEM - Simplified Chinese (PRC, Singapore)" },
        { 949, "ANSI/OEM - Korean (Unified Hangeul Code)" },
        { 950, "ANSI/OEM - Traditional Chinese" },
        { 1026, "IBM EBCDIC - Turkish (Latin-5)" },
        { 1047, "IBM EBCDIC - Latin 1/Open System" },
        { 1140, "EBCDIC-CA-FR@EURO" },
        { 1141, "EBCDIC-AT-DE@EURO" },
        { 1142, "EBCDIC-DK-NO@EURO" },
        { 1143, "EBCDIC-FI-SE@EURO" },
        { 1144, "EBCDIC-IT@EURO" },
        { 1145, "EBCDIC-ES-A@EURO" },
        { 1146, "EBCDIC-UK@EURO" },
        { 1147, "EBCDIC-FR@EURO" },
        { 1148, "IBM EBCDIC - International (500 + Euro symbol)" },
        { 1149, "EBCDIC-IS-FRISS@EURO" },
        { 1200, "UCS-2-LE" },
        { 1201, "UCS-2-BE" },
        { 1251, "ANSI - Cyrillic" },
        { 1252, "ANSI - Latin I" },
        { 1253, "ANSI - Greek" },
        { 1254, "ANSI - Turkish" },
        { 1255, "ANSI - Hebrew" },
        { 1256, "ANSI - Arabic" },
        { 1257, "ANSI - Baltic" },
        { 1258, "ANSI/OEM - Vietnamese" },
        { 1361, "Korean (Johab)" },
        { 10000, "MAC - Roman" },
        { 10001, "MAC-JP" },
        { 10002, "MAC - Traditional Chinese (Big5)" },
        { 10003, "MAC-KR" },
        { 10004, "MAC-AR" },
        { 10005, "MAC - Hebrew" },
        { 10006, "MAC - Greek I" },
        { 10007, "MAC-CYRILLIC" },
        { 10008, "MAC - Simplified Chinese (GB 2312)" },
        { 10010, "MAC - Romania" },
        { 10017, "MAC - Ukraine" },
        { 10021, "MAC - Thai" },
        { 10029, "MAC - Latin II" },
        { 10079, "MAC - Icelandic" },
        { 10081, "MAC - Turkish" },
        { 10082, "MAC - Croatia" },
        { 12000, "UCS-4-LE" },
        { 12001, "UCS-4-BE" },
        { 20000, "CNS - Taiwan" },
        { 20001, "TCA - Taiwan" },
        { 20002, "Eten - Taiwan" },
        { 20003, "IBM5550 - Taiwan" },
        { 20004, "TeleText - Taiwan" },
        { 20005, "Wang - Taiwan" },
        { 20105, "IA5 IRV International Alphabet No. 5 (7-bit)" },
        { 20106, "IA5 German (7-bit)" },
        { 20107, "IA5 Swedish (7-bit)" },
        { 20108, "IA5 Norwegian (7-bit)" },
        { 20127, "ANSI_X3.4-1968" },
        { 20261, "T.61" },
        { 20269, "ISO 6937 Non-Spacing Accent" },
        { 20273, "EBCDIC-DE" },
        { 20277, "EBCDIC-DK-NO" },
        { 20278, "EBCDIC-FI-SE" },
        { 20280, "EBCDIC-IT" },
        { 20284, "EBCDIC-ES-A" },
        { 20285, "EBCDIC-UK" },
        { 20290, "EBCDIC-JP" },
        { 20297, "EBCDIC-FR" },
        { 20420, "EBCDIC-AR" },
        { 20423, "IBM EBCDIC - Greek" },
        { 20424, "IBM EBCDIC - Hebrew" },
        { 20833, "EBCDIC-KR" },
        { 20838, "IBM EBCDIC - Thai" },
        { 20866, "Russian - KOI8-R" },
        { 20871, "IBM EBCDIC - Icelandic" },
        { 20880, "IBM EBCDIC - Cyrillic (Russian)" },
        { 20905, "IBM EBCDIC - Turkish" },
        { 20924, "IBM EBCDIC - Latin-1/Open System (1047 + Euro symbol)" },
        { 20932, "JIS X 0208-1990 & 0121-1990" },
        { 20936, "GB2312" },
        { 21025, "IBM EBCDIC - Cyrillic (Serbian, Bulgarian)" },
        { 21027, "Extended Alpha Lowercase" },
        { 21866, "Ukrainian (KOI8-U)" },
        { 28591, "ISO-8859-1" },
        { 28592, "ISO-8859-2" },
        { 28593, "ISO-8859-3" },
        { 28594, "ISO-8859-4" },
        { 28595, "ISO-8859-5" },
        { 28596, "ISO-8859-6" },
        { 28597, "ISO-8859-7" },
        { 28598, "ISO-8859-8" },
        // code page 28599 is ISO 8859-9 (Latin 5, Turkish); the name
        // used to read "ISO-8859-95", which no caller could match
        { 28599, "ISO-8859-9" },
        { 28605, "ISO-8859-15" },
        { 29001, "Europa 3" },
        { 38598, "ISO-8859-8 Hebrew" },
        { 50220, "ISO-2022 Japanese with no halfwidth Katakana" },
        { 50221, "ISO-2022-JP" },
        { 50222, "ISO-2022 Japanese JIS X 0201-1989" },
        { 50225, "ISO-2022-KR" },
        { 50227, "ISO-2022 Simplified Chinese" },
        { 50229, "ISO-2022 Traditional Chinese" },
        { 50930, "Japanese (Katakana) Extended" },
        { 50931, "US/Canada and Japanese" },
        { 50933, "Korean Extended and Korean" },
        { 50935, "Simplified Chinese Extended and Simplified Chinese" },
        { 50936, "Simplified Chinese" },
        { 50937, "US/Canada and Traditional Chinese" },
        { 50939, "Japanese (Latin) Extended and Japanese" },
        { 51932, "EUC-JP" },
        { 51936, "EUC - Simplified Chinese" },
        { 51949, "EUC-KR" },
        { 51950, "EUC - Traditional Chinese" },
        { 52936, "HZ-GB2312 Simplified Chinese" },
        { 54936, "Windows XP: GB18030 Simplified Chinese (4 Byte)" },
        // NOTE(review): the next name looks like two table rows merged
        // into one (57002 Devanagari and 57003 Bengali), which leaves
        // code page 57003 unreachable -- confirm before splitting it
        { 57002, "ISCII Devanagari 57003 ISCII Bengali" },
        { 57004, "ISCII Tamil" },
        { 57005, "ISCII Telugu" },
        { 57006, "ISCII Assamese" },
        { 57007, "ISCII Oriya" },
        { 57008, "ISCII Kannada" },
        { 57009, "ISCII Malayalam" },
        { 57010, "ISCII Gujarati" },
        { 57011, "ISCII Punjabi" },
        { 65000, "UTF-7" },
        { 65001, "UTF-8" }
    };

    // strcmp() on a null pointer is undefined; report it like any
    // other unsupported conversion
    if (!from_code || !to_code) {
        errno = EINVAL;
        return -1;
    }

    const size_t npages = sizeof pages / sizeof *pages;

    // 0 means "not found yet": no table entry has code page 0
    unsigned from_page = 0;
    unsigned to_page   = 0;

    // a single pass resolves both names; stop as soon as both are known
    for (size_t i = 0; i != npages && !(from_page && to_page); ++i) {

        if (!from_page && !strcmp (from_code, pages [i].name))
            from_page = unsigned (pages [i].code);

        if (!to_page && !strcmp (to_code, pages [i].name))
            to_page = unsigned (pages [i].code);
    }

    // validate code pages: both names must have been found and both
    // code pages must be usable on this system
    if (   !from_page
        || !to_page
        || !MultiByteToWideChar (to_page, 0, "", 1, 0, 0)
        || !MultiByteToWideChar (from_page, 0, "", 1, 0, 0)) {
        // [EINVAL]
        // The conversion specified by fromcode and tocode
        // is not supported by the implementation.
        errno = EINVAL;
        return -1;
    }

    // pack in unsigned arithmetic: code pages >= 32768 (e.g., 65001)
    // would otherwise be shifted into the sign bit of a signed int
    const unsigned packed = (from_page << 16) | (to_page & 0x0000ffffU);

    return iconv_t (packed);
}
|
|
|
|
|
|
size_t iconv (iconv_t cd,
|
|
char **inbuf, size_t *inbytesleft,
|
|
char **outbuf, size_t *outbytesleft)
|
|
{
|
|
const int fromcode = (cd >> 16) & 0x0000ffff;
|
|
const int tocode = cd & 0x0000ffff;
|
|
|
|
if (0x0000ffff == fromcode || 0x0000ffff == tocode) {
|
|
errno = EBADF;
|
|
return size_t (-1);
|
|
}
|
|
|
|
wchar_t wbuf [256];
|
|
|
|
// allocate a sufficient amount of storage to conver the input
|
|
// buffer to wide character assuming, pessimistically, that
|
|
// each byte converts to one wide character
|
|
wchar_t *pwc = *inbytesleft < sizeof wbuf / sizeof *wbuf ?
|
|
wbuf : new wchar_t [*inbytesleft];
|
|
|
|
// convert the contents of the narrow input buffer to wide characters
|
|
// `nwout' -- number of wide chars successfully produced by the call
|
|
const int nwout =
|
|
MultiByteToWideChar (fromcode, MB_ERR_INVALID_CHARS,
|
|
*inbuf, int (*inbytesleft),
|
|
pwc, int (*inbytesleft));
|
|
|
|
if (!nwout) {
|
|
if (pwc != wbuf)
|
|
delete[] pwc;
|
|
|
|
const int error = GetLastError ();
|
|
errno = ERROR_INSUFFICIENT_BUFFER == error || !error ? E2BIG : EILSEQ;
|
|
return size_t (-1);
|
|
}
|
|
|
|
// convert the contents of wide character buffer into the narrow
|
|
// character buffer
|
|
// `nnout' -- number of narrow chars successfully produced by the
|
|
// call
|
|
const int nnout =
|
|
WideCharToMultiByte (tocode, 0,
|
|
pwc, nwout,
|
|
*outbuf, int (*outbytesleft),
|
|
0, 0);
|
|
if (!nnout || !*outbytesleft) {
|
|
if (pwc != wbuf)
|
|
delete[] pwc;
|
|
|
|
const int error = GetLastError ();
|
|
errno = ERROR_INSUFFICIENT_BUFFER == error || !error ? E2BIG : EILSEQ;
|
|
return size_t (-1);
|
|
}
|
|
|
|
// compute the number of wide characters consumed by second
|
|
// conversion
|
|
// `nwin' -- number of wide chars consumed by the call above
|
|
const int nwin =
|
|
MultiByteToWideChar (tocode, 0, *outbuf, nnout, 0, 0);
|
|
|
|
if (!nwin) {
|
|
if (pwc != wbuf)
|
|
delete[] pwc;
|
|
|
|
const int error = GetLastError ();
|
|
errno = ERROR_INSUFFICIENT_BUFFER == error || !error ? E2BIG : EILSEQ;
|
|
return size_t (-1);
|
|
}
|
|
|
|
// finally, compute the number of narrow characters in the source
|
|
// encoding corresponding to the number of narrow characters in
|
|
// the destrination encoding
|
|
// `nnin' -- number of narrow chars consumed by the first call
|
|
const int nnin =
|
|
WideCharToMultiByte (fromcode, 0, pwc, nwin, 0, 0, 0, 0);
|
|
if (pwc != wbuf)
|
|
delete[] pwc;
|
|
|
|
if (!nnin) {
|
|
const int error = GetLastError ();
|
|
errno = ERROR_INSUFFICIENT_BUFFER == error || !error ? E2BIG : EILSEQ;
|
|
return size_t (-1);
|
|
}
|
|
|
|
// advance buffers to the first character to convert
|
|
*inbuf += nnin;
|
|
*outbuf += nnout;
|
|
|
|
// decrement the size of each buffer
|
|
*inbytesleft -= nnin;
|
|
*outbytesleft -= nnout;
|
|
|
|
if (*inbytesleft) {
|
|
errno = E2BIG;
|
|
return size_t (-1);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
// Disposes of the conversion descriptor `cd'. The descriptor owns no
// resources, so the only work is to validate it.
//
// Returns 0 for a valid descriptor. Returns -1 and sets errno to EBADF
// when either 16-bit half of `cd' has all bits set, as is the case for
// the -1 descriptor that denotes failure.
int iconv_close (iconv_t cd)
{
    const int source_page = (cd >> 16) & 0x0000ffff;
    const int target_page = cd & 0x0000ffff;

    if (0x0000ffff != source_page && 0x0000ffff != target_page)
        return 0;

    errno = EBADF;
    return -1;
}
|
|
|
|
#endif // _WIN{32,64}
|