/* This file is part of the wvWare 2 project
   Copyright (C) 2001-2003 Werner Trobin

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public
   License version 2 as published by the Free Software Foundation.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public License
   along with this library; see the file COPYING.LIB.  If not, write to
   the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.
*/

#include "textconverter.h"
#include "ustring.h"

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#ifdef HAVE_ICONV_H
#include <iconv.h>
#endif
#ifdef HAVE_SYS_ICONV_H
#include <sys/iconv.h>
#endif

#include "wvlog.h"
#include <errno.h>
#include <vector>

// NOTE(review): ICONV_CONST is expected to come from the configure step
// (some iconv() prototypes take "const char**", others "char**"). If the
// build did not define it, fall back to the plain "char**" form.
#ifndef ICONV_CONST
#define ICONV_CONST
#endif

using namespace wvWare;

namespace
{
    // Name of the iconv code set we convert to by default (UCS-2 in the
    // byte order of the host).
#ifdef WORDS_BIGENDIAN
    const char* const nativeUnicode = "UNICODEBIG";
#else
    const char* const nativeUnicode = "UNICODELITTLE";
#endif

    // iconv writes raw 16-bit code units into the UChar buffer.
    const size_t bytesPerUnit = 2;

    // The value iconv_open() returns on failure; also used as the marker
    // for "no converter open".
    inline iconv_t invalidDescriptor()
    {
        return reinterpret_cast<iconv_t>( -1 );
    }
}

// Private data of the TextConverter: the two code set names and the
// iconv descriptor created from them.
class TextConverter::Private
{
public:
    Private( const std::string& toCode, const std::string& fromCode ) :
        m_toCode( toCode ), m_fromCode( fromCode ),
        m_iconv( invalidDescriptor() ), m_swap( false ) {}

    explicit Private( const std::string& fromCode ) :
        m_toCode( nativeUnicode ), m_fromCode( fromCode ),
        m_iconv( invalidDescriptor() ), m_swap( false ) {}

    explicit Private( U16 lid ) :
        m_toCode( nativeUnicode ), m_fromCode( TextConverter::LID2Codepage( lid ) ),
        m_iconv( invalidDescriptor() ), m_swap( false ) {}

    std::string m_toCode, m_fromCode;
    iconv_t m_iconv;
    bool m_swap;  // not used anywhere in this file; initialized for safety
};

TextConverter::TextConverter( const std::string& toCode, const std::string& fromCode ) :
    d( new Private( toCode, fromCode ) )
{
    open();
}

TextConverter::TextConverter( const std::string& fromCode ) : d( new Private( fromCode ) )
{
    open();
}

TextConverter::TextConverter( U16 lid ) : d( new Private( lid ) )
{
    open();
}

TextConverter::~TextConverter()
{
    close();
    delete d;
}

// Returns true if an iconv descriptor is currently open.
bool TextConverter::isOk() const
{
    return d->m_iconv != invalidDescriptor();
}

// Changes the target code set and re-opens the converter.
void TextConverter::setToCode( const std::string& toCode )
{
    d->m_toCode = toCode;
    close();
    open();
}

std::string TextConverter::toCode() const
{
    return d->m_toCode;
}

// Changes the source code set and re-opens the converter.
void TextConverter::setFromCode( const std::string& fromCode )
{
    d->m_fromCode = fromCode;
    close();
    open();
}

std::string TextConverter::fromCode() const
{
    return d->m_fromCode;
}

UString TextConverter::convert( const std::string& input ) const
{
    return convert( input.c_str(), input.size() );
}

// Converts "length" bytes starting at "input" with the open converter.
// If iconv rejects the chunk, the chunk is split in halves which are
// converted separately, so that as much text as possible survives; a
// single byte that cannot be converted yields an empty UString.
UString TextConverter::convert( const char* input, unsigned int length ) const
{
    if ( !isOk() ) {
        wvlog << "Error: I don't have any open converter." << std::endl;
        return UString();
    }

    // WinWord doesn't have multi-byte characters encoded in compressed-unicode
    // sections, right? Hence one UChar per input byte is enough.
    // (At least one element, so that &output[ 0 ] is always valid.)
    std::vector<UChar> output( length != 0 ? length : 1 );
    char* p_output = reinterpret_cast<char*>( &output[ 0 ] );
    // Computed in size_t, "length << 1" could wrap around in unsigned int.
    size_t outputLen = static_cast<size_t>( length ) * bytesPerUnit;

    const char* p_input = input;
    size_t inputLen = length;

    if ( static_cast<size_t>( -1 ) == iconv( d->m_iconv, const_cast<ICONV_CONST char**>( &p_input ), &inputLen,
                                             &p_output, &outputLen ) ) {
        // Remember the reason right away, the calls below may clobber errno.
        const int conversionError = errno;

        // Release the buffer before recursing and put the descriptor back
        // into its initial state, the failed call may have left it in the
        // middle of a sequence.
        std::vector<UChar>().swap( output );
        iconv( d->m_iconv, 0, 0, 0, 0 );

        // If we got more than one character, try to return as much text as possible...
        // To convert the text with as few iconv calls as possible we are using a divide
        // and conquer approach.
        if ( length > 1 ) {
            const unsigned int firstHalf = length / 2;
            UString ustring( convert( input, firstHalf ) );
            ustring += convert( input + firstHalf, length - firstHalf );
            return ustring;
        }

        wvlog << "Error: The conversion was not successful: " << conversionError << std::endl;
        return UString();
    }

    // Every input byte is expected to have produced exactly one UChar.
    if ( outputLen != 0 )
        wvlog << "Strange, got an outputLen of " << outputLen << std::endl;

    // "true" is passed exactly as the original code did.
    // NOTE(review): presumably it requests a deep copy, which is why the
    // local buffer may be released on return; confirm against ustring.h.
    return UString( &output[ 0 ], static_cast<int>( length - outputLen / bytesPerUnit ), true );
}

// Maps the nLocale values handled below to a LID; everything else maps to 0.
U16 TextConverter::locale2LID( U8 nLocale )
{
    switch ( nLocale ) {
        case 134:  // Chinese Simplified
            return 0x804;
        case 136:  // Chinese Traditional
            return 0x404;
        // Add Japanese, Korean and whatever nLocale you see fit.
        default:
            return 0x0;
    }
}

// Maps a LID to a language tag. Several regional variants are folded into
// one tag; unknown LIDs are reported as "en-US".
const char* TextConverter::LID2lang( U16 lid )
{
    switch ( lid ) {
        case 0x0405:
            return "cs-CZ";
        case 0x0406:
            return "da-DK";
        case 0x0807: /* swiss german */
        case 0x0407: /* german */
            return "de-DE";
        case 0x0809: /* british english */
            return "en-GB";
        case 0x0c09:
            return "en-AU";
        case 0x0413: /* dutch */
            return "nl-NL"; /* netherlands; "da" would be Danish */
        case 0x040a: /* castillian */
        case 0x080a: /* mexican */
            return "es-ES";
        case 0x040b:
            return "fi-FI";
        case 0x040c:
            return "fr-FR";
        case 0x0410:
            return "it-IT";
        case 0x040d: /* hebrew */
            return "iw-IL";
        case 0x0416: /* brazilian */
        case 0x0816: /* portuguese */
            return "pt-PT";
        case 0x0419:
            return "ru-RU";
        case 0x041d:
            return "sv-SE";
        case 0x0400:
            return "-none-";
        case 0x0409:
        default:
            return "en-US";
    }
}

// Maps a LID to the name of the matching Windows code page as understood
// by iconv. Unknown LIDs yield "not known", which open() checks for.
const char* TextConverter::LID2Codepage( U16 lid )
{
    static const char *cp874 = "CP874";
    static const char *cp932 = "CP932";
    static const char *cp936 = "CP936";
    static const char *cp949 = "CP949";
    static const char *cp950 = "CP950";
    static const char *cp1250 = "CP1250";
    static const char *cp1251 = "CP1251";
    static const char *cp1252 = "CP1252";
    static const char *cp1253 = "CP1253";
    static const char *cp1254 = "CP1254";
    static const char *cp1255 = "CP1255";
    static const char *cp1256 = "CP1256";
    static const char *cp1257 = "CP1257";
    static const char *unknown = "not known";

    // Work around spec bugs. Thomas Zander's documents had very
    // weird lid codes. Mac Word?
    if ( lid < 999 )
        lid = fixLID( lid );

    switch ( lid ) {
        case 0x0401:    /*Arabic*/
            return cp1256;
        case 0x0402:    /*Bulgarian*/
            return cp1251;
        case 0x0403:    /*Catalan*/
            return cp1252;
        case 0x0404:    /*Traditional Chinese*/
            return cp950;
        case 0x0804:    /*Simplified Chinese*/
            return cp936;
        case 0x0405:    /*Czech*/
            return cp1250;
        case 0x0406:    /*Danish*/
            return cp1252;
        case 0x0407:    /*German*/
            return cp1252;
        case 0x0807:    /*Swiss German*/
            return cp1252;
        case 0x0408:    /*Greek*/
            return cp1253;
        case 0x0409:    /*U.S. English*/
            return cp1252;
        case 0x0809:    /*U.K. English*/
            return cp1252;
        case 0x0c09:    /*Australian English*/
            return cp1252;
        case 0x040a:    /*Castilian Spanish*/
            return cp1252;
        case 0x080a:    /*Mexican Spanish*/
            return cp1252;
        case 0x0c0a:    /*Traditional Spanish*/
            return cp1252;  // TBD: Undocumented!
        case 0x040b:    /*Finnish*/
            return cp1252;
        case 0x040c:    /*French*/
            return cp1252;
        case 0x080c:    /*Belgian French*/
            return cp1252;
        case 0x0c0c:    /*Canadian French*/
            return cp1252;
        case 0x100c:    /*Swiss French*/
            return cp1252;
        case 0x040d:    /*Hebrew*/
            return cp1255;
        case 0x040e:    /*Hungarian*/
            return cp1250;
        case 0x040f:    /*Icelandic*/
            return cp1252;
        case 0x0410:    /*Italian*/
            return cp1252;
        case 0x0810:    /*Swiss Italian*/
            return cp1252;
        case 0x0411:    /*Japanese*/
            return cp932;
        case 0x0412:    /*Korean*/
            return cp949;
        case 0x0413:    /*Dutch*/
            return cp1252;
        case 0x0813:    /*Belgian Dutch*/
            return cp1252;
        case 0x0414:    /*Norwegian - Bokmal*/
            return cp1252;
        case 0x0814:    /*Norwegian - Nynorsk*/
            return cp1252;
        case 0x0415:    /*Polish*/
            return cp1250;
        case 0x0416:    /*Brazilian Portuguese*/
            return cp1252;
        case 0x0816:    /*Portuguese*/
            return cp1252;
        case 0x0417:    /*Rhaeto-Romanic*/
            return cp1252;
        case 0x0418:    /*Romanian*/
            return cp1250;  // Central European, was wrongly cp1252
        case 0x0419:    /*Russian*/
            return cp1251;
        case 0x041a:    /*Croato-Serbian (Latin)*/
            return cp1250;
        case 0x081a:    /*Serbo-Croatian (Cyrillic) */
            // NOTE(review): cp1252 cannot encode Cyrillic; cp1251 (or cp1250,
            // if this LID is really the Latin variant) looks more plausible.
            // Left unchanged until confirmed.
            return cp1252;
        case 0x041b:    /*Slovak*/
            return cp1250;
        case 0x041c:    /*Albanian*/
            return cp1250;  // Latin script, was wrongly cp1251 (Cyrillic)
        case 0x041d:    /*Swedish*/
            return cp1252;
        case 0x041e:    /*Thai*/
            return cp874;
        case 0x041f:    /*Turkish*/
            return cp1254;
        case 0x0420:    /*Urdu*/
            return cp1256;
        case 0x0421:    /*Bahasa*/
            return cp1252;  // Latin script, was wrongly cp1256 (Arabic)
        case 0x0422:    /*Ukrainian*/
            return cp1251;
        case 0x0423:    /*Byelorussian*/
            return cp1251;
        case 0x0424:    /*Slovenian*/
            return cp1250;
        case 0x0425:    /*Estonian*/
            return cp1257;
        case 0x0426:    /*Latvian*/
            return cp1257;
        case 0x0427:    /*Lithuanian*/
            return cp1257;
        case 0x0429:    /*Farsi*/
            return cp1256;
        case 0x042D:    /*Basque*/
            return cp1252;
        case 0x042F:    /*Macedonian*/
            return cp1251;
        case 0x0436:    /*Afrikaans*/
            return cp1252;
        case 0x043E:    /*Malaysian*/
            return cp1252;  // Latin script, was wrongly cp1251 (Cyrillic)
        default:
            return unknown;
    }
}

// Closes the iconv descriptor (if any) and marks the converter as not open.
void TextConverter::close()
{
    if ( isOk() )
        iconv_close( d->m_iconv );
    d->m_iconv = invalidDescriptor();
}

// Opens an iconv descriptor for the current from/to code sets. Unusual
// settings are only logged; afterwards isOk() tells whether it worked.
void TextConverter::open()
{
    if ( isOk() ) {
        wvlog << "Warning: Do you really want to get rid of the current converter?" << std::endl;
        close();
    }

    if ( d->m_toCode != nativeUnicode )
        wvlog << "Warning: Do you really want to do convert to something else than "
              << nativeUnicode << "?" << std::endl;

    // "not known" is what LID2Codepage() returns for LIDs it has no mapping for.
    if ( d->m_fromCode == "not known" )
        wvlog << "Warning: We don't know the current charset you want to convert from!" << std::endl;

    if ( d->m_toCode.empty() || d->m_fromCode.empty() )
        return;

    d->m_iconv = iconv_open( d->m_toCode.c_str(), d->m_fromCode.c_str() );
    if ( !isOk() )
        wvlog << "Error: Could not open a converter from " << d->m_fromCode.c_str()
              << " to " << d->m_toCode.c_str() << ": " << errno << std::endl;
}

// Translates the odd small "LID" values seen in some documents to real LIDs.
U16 TextConverter::fixLID( U16 nLocale )
{
    // I have no idea which code these nLocale numbers are, but the
    // files in ftp://dkuug.dk/i18n/charmaps look promising. If another
    // one of those files turns out to be right I'll add the remaining
    // mappings.
    switch( nLocale ) {
        case 0x13:  /* Dutch */
            return 0x0413;
        default:
            return nLocale;
    }
}