diff options
Diffstat (limited to 'tdecore/kcharsets.cpp')
-rw-r--r-- | tdecore/kcharsets.cpp | 664 |
1 files changed, 664 insertions, 0 deletions
diff --git a/tdecore/kcharsets.cpp b/tdecore/kcharsets.cpp new file mode 100644 index 000000000..b78345276 --- /dev/null +++ b/tdecore/kcharsets.cpp @@ -0,0 +1,664 @@ +/* This file is part of the KDE libraries + Copyright (C) 1999 Lars Knoll ([email protected]) + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public License + along with this library; see the file COPYING.LIB. If not, write to + the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. +*/ +#include "kcharsets.h" + +#include "kqiodevicegzip_p.h" +#include "kentities.c" + +#include <kapplication.h> +#include <kglobal.h> +#include <klocale.h> +#include <kconfig.h> + +#include <tqfontinfo.h> +#include <tqstrlist.h> +#include <tqfontdatabase.h> +#include <kdebug.h> + +#include <tqtextcodec.h> +#include <tqmap.h> +#include <tqcstring.h> +#include <tqdir.h> +#include <tqregexp.h> + +#include <assert.h> + +static const char * const language_names[] = { + I18N_NOOP( "Other" ), + I18N_NOOP( "Arabic" ), + I18N_NOOP( "Baltic" ), + I18N_NOOP( "Central European" ), + I18N_NOOP( "Chinese Simplified" ), + I18N_NOOP( "Chinese Traditional" ), + I18N_NOOP( "Cyrillic" ), + I18N_NOOP( "Greek" ), + I18N_NOOP( "Hebrew" ), + I18N_NOOP( "Japanese" ), + I18N_NOOP( "Korean" ), + I18N_NOOP( "Thai" ), + I18N_NOOP( "Turkish" ), + I18N_NOOP( "Western European" ), + I18N_NOOP( "Tamil" ), + I18N_NOOP( "Unicode" ), + I18N_NOOP( "Northern Saami" ), + I18N_NOOP( "Vietnamese" ), + I18N_NOOP( "South-Eastern Europe" ) +}; + +// This list gives the charsets that can be used to display a file given in a certain encoding. +// The list should be in order of preference +static const char* const charsets_for_encoding[] = { + "koi8-r", + "koi8-u", + "iso 8859-1", + "iso 8859-2", + "iso 8859-3", + "iso 8859-4", + "iso 8859-5", + "iso 8859-6", + "iso 8859-7", + "iso 8859-8", + "iso 8859-8-i", + "iso 8859-9", + "iso 8859-11", + "iso 8859-13", + "iso 8859-14", + "iso 8859-15", + "iso 8859-16", + "utf8", + "utf16", + "iso-10646-ucs-2", + "cp 1250", + "cp 1251", + "cp 1252", + "cp 1253", + "cp 1254", + "cp 1255", + "cp 1256", + "cp 1257", + "cp 1258", + "ibm850", + "ibm852", + "ibm866", + "tis620", + "eucjp", + "sjis", + "jis7", + "big5", + "big5-hkscs", + "gbk", + "gb18030", + "gb2312", + "euckr", + "tscii", +// "pt 154", + "winsami2", + "cp 874", + 0 }; // extra 0 for end + +// 0 other +// 1 Arabic +// 2 Baltic +// 3 Central European +// 4 Chinese Simplified +// 5 Chinese Traditional +// 6 Cyrillic +// 7 Greek +// 8 Hebrew +// 9 Japanese +// 10 Korean +// 11 Thai +// 12 Turkish +// 13 Western European +// 14 Tamil +// 15 Unicode +// 16 Northern Sami +// 17 Vietnamese +// 18 South-Eastern Europe +// ### FIXME KDE4: the name of the encodings should mostly be uppercase +static struct LanguageForEncoding + { + const char* index; + int data; + } const language_for_encoding[] = { + { "iso 8859-1", 13 }, + { "iso 8859-15", 13 }, + { "iso 8859-14", 13 }, + { "cp 1252", 13 }, + { "ibm850", 13 }, + { "iso 8859-2", 3 }, + { "iso 8859-3", 3 }, + { "iso 8859-4", 2 }, + { "iso 8859-13", 2 }, + { "iso 8859-16", 18 }, + { "cp 1250", 3 }, + { "cp 1254", 12 }, + { "cp 1257", 2 }, + { "ibm852", 3 }, + { "koi8-r", 6 }, + { "iso 8859-5", 6 }, + { "cp 1251", 6 }, + { "koi8-u", 6 }, +// { "pt 154", 6 }, + { "ibm866", 6 }, + { "big5", 5 }, + { "big5-hkscs", 5 }, + { "gb18030", 4 }, + { "gbk", 4 }, + { "gb2312", 4 }, + { "euckr", 10 }, + { "sjis", 9 }, + { "jis7", 9 }, + { "eucjp", 9 }, + { "iso 8859-7", 7 }, + { "cp 1253", 7 }, + { "iso 8859-6", 1 }, + { "cp 1256", 1 }, + { "iso 8859-8", 8 }, + { "iso 8859-8-i", 8 }, + { "cp 1255", 8 }, + { "iso 8859-9", 12 }, + { "tis620", 11 }, + { "iso 8859-11", 11 }, + { "cp 874", 11 }, + { "cp 1258", 17 }, + { "tscii", 14 }, + { "utf8", 15 }, + { "utf16", 15 }, + { "utf7", 15 }, // ### FIXME: UTF-7 is not in Qt + { "ucs2", 15 }, + { "iso-10646-ucs-2", 15 }, + { "winsami2", 16}, + { 0, 0 } }; + +// defines some different names for codecs that are built into Qt. +static struct Builtin + { + const char* index; + const char* data; + } const builtin[] = { + { "iso-ir-111", "koi8-r" }, + { "koi8-ru", "koi8-u" }, // ### Qt 3.3 maps it to koi8-r + { "koi unified", "koi8-r" }, // ### FIXME: Qt 3.3 seems to map this to EUC-KR, so this mapping is too late :-( + // Using ISO-8859-1 for ASCII is an approximation at write + { "us-ascii", "iso 8859-1" }, + { "usascii", "iso 8859-1" }, + { "ascii", "iso 8859-1" }, + { "x-utf-8", "utf-8" }, + { "x-utf-7", "utf-7" }, // ### FIXME: UTF-7 is not in Qt + { "tqunicode-1-1-utf-7", "utf-7" }, // ### FIXME: UTF-7 is not in Qt + { "utf-16", "iso-10646-ucs-2" }, + { "utf16", "iso-10646-ucs-2" }, + { "ucs2", "iso-10646-ucs-2" }, + { "iso10646-1", "iso-10646-ucs-2" }, + { "gb18030.2000-1", "gb18030" }, + { "gb18030.2000-0", "gb18030" }, + { "gbk-0", "gbk" }, + { "gb2312.1980-0", "gbk" }, + { "gb_2312-80", "gbk" },/* this one is not official, but MS is using it :/ */ + { "x-euc-kr", "euckr" }, + { "jisx0201.1976-0", "eucjp" }, + { "jisx0208.1983-0", "eucjp" }, + { "jisx0208.1990-0", "eucjp" }, + { "jisx0208.1997-0", "eucjp" }, + { "jisx0212.1990-0", "eucjp" }, + { "jisx0213.2000-1", "eucjp" }, + { "jisx0213.2000-2", "eucjp" }, + { "windows850", "ibm850" }, + { "windows866", "ibm866" }, + { "windows1251", "cp 1251" }, + { "windows1252", "cp 1252" }, + { "windows1253", "cp 1253" }, + { "windows1254", "cp 1254" }, + { "windows1255", "cp 1255" }, + { "windows1256", "cp 1256" }, + { "windows1257", "cp 1257" }, + { "windows1258", "cp 1258" }, + { "windows-850", "ibm850" }, + { "windows-866", "ibm866" }, + { "x-windows-850", "ibm850" }, + { "x-windows-866", "ibm866" }, + { "x-windows-1250", "cp 1250" }, + { "x-windows-1251", "cp 1251" }, + { "x-windows-1252", "cp 1252" }, + { "x-windows-1253", "cp 1253" }, + { "x-windows-1254", "cp 1254" }, + { "x-windows-1255", "cp 1255" }, + { "x-windows-1256", "cp 1256" }, + { "x-windows-1257", "cp 1257" }, + { "x-windows-1258", "cp 1258" }, + { "cp819", "iso 8859-1" }, + { "cp850", "ibm850" }, + { "cp866", "ibm866" }, + { "cp-819", "iso 8859-1" }, + { "cp-850", "ibm850" }, + { "cp-866", "ibm866" }, + { "cp-1250", "cp 1250" }, + { "cp-1251", "cp 1251" }, + { "cp-1252", "cp 1252" }, + { "cp-1253", "cp 1253" }, + { "cp-1254", "cp 1254" }, + { "cp-1255", "cp 1255" }, + { "cp-1256", "cp 1256" }, + { "cp-1257", "cp 1257" }, + { "cp-1258", "cp 1258" }, + { "cp-10000", "apple roman" }, + { "x-cp-850", "ibm850" }, + { "x-cp-866", "ibm866" }, + { "x-cp-1250", "cp 1250" }, + { "x-cp-1251", "cp 1251" }, + { "x-cp-1252", "cp 1252" }, + { "x-cp-1253", "cp 1253" }, + { "x-cp-1254", "cp 1254" }, + { "x-cp-1255", "cp 1255" }, + { "x-cp-1256", "cp 1256" }, + { "x-cp-1257", "cp 1257" }, + { "x-cp-1258", "cp 1258" }, + { "x-cp-10000", "apple roman" }, + { "ibm819", "iso 8859-1" }, + { "thai-tis620", "iso 8859-11" }, + { "windows-874", "cp 874" }, + { "windows874", "cp 874" }, + { "x-windows-874", "cp 874" }, + { "x-cp-874", "cp 874" }, + { "ibm 874", "cp 874" }, + { "ibm874", "cp 874" }, // Qt4 name + { "x-ibm874", "cp 874" }, + { "ksc5601.1987-0", "euckr" }, + { "x-winsami2", "winsami2" }, + { "x-mac-roman", "apple roman" }, + { "macintosh", "apple roman" }, + { "mac", "apple roman" }, + { "csiso2022jp", "jis7" }, // See bug #77243 + { "big5-eten", "big5-hkscs" }, + { "cp950", "big5-hkscs" }, + { 0, 0 }}; + +// some different names for the encodings defined in the charmaps files. +// even though the charmap file names are all uppercase, the names are all lowercase here. +static struct Aliases + { + const char* index; + const char* data; + } const aliases[] = { + { "cp852", "ibm852" }, + { "cp-852", "ibm852" }, + { "x-cp-852", "ibm852" }, + { "windows852", "ibm852" }, + { "windows-852", "ibm852" }, + { "x-windows-852", "ibm852" }, + { 0, 0 }}; + +// some last resort hints in case the charmap file couldn't be found. This gives at least a partial conversion +// and helps making things readable. +// the name used as input here is already converted to the more canonical name as defined in the aliases array. +static struct ConversionHints + { + const char* index; + const char* data; + } const conversion_hints[] = { + { "cp1250", "iso-8859-2" }, + { "koi8-r", "iso-8859-5" }, + { "koi8-u", "koi8-r" }, + // KDE had always "CP 1251" as best fallback to PT 154. Now that Qt does not offer this encoding anymore, it is our fallback. + { "pt 154", "cp 1251" }, + { "paratype-154", "cp 1251" }, + { "pt-154", "cp 1251" }, + { 0, 0 }}; + + +// search an array of items index/data, index is const char*, data is T, find first matching index +// and return data, or return 0 +template< typename T, typename Data > +static Data kcharsets_array_search( const T* start, const char* entry ) +{ + for( const T* pos = start; + pos->index != 0; + ++pos ) + if( qstrcmp( pos->index, entry ) == 0 ) + return pos->data; + return 0; +} + + +class KCharsetsPrivate +{ +public: + KCharsetsPrivate(KCharsets* _kc) + : codecForNameDict(43, false) // case insensitive + { + db = 0; + kc = _kc; + } + ~KCharsetsPrivate() + { + delete db; + } + TQFontDatabase *db; + TQAsciiDict<TQTextCodec> codecForNameDict; + KCharsets* kc; +}; + +// -------------------------------------------------------------------------- + +KCharsets::KCharsets() +{ + d = new KCharsetsPrivate(this); +} + +KCharsets::~KCharsets() +{ + delete d; +} + +TQChar KCharsets::fromEntity(const TQString &str) +{ + TQChar res = TQChar::null; + + int pos = 0; + if(str[pos] == (QChar)'&') pos++; + + // Check for '�' or '�' sequence + if (str[pos] == (QChar)'#' && str.length()-pos > 1) { + bool ok; + pos++; + if (str[pos] == (QChar)'x' || str[pos] == (QChar)'X') { + pos++; + // '�', hexadeciaml character reference + TQString tmp(str.tqunicode()+pos, str.length()-pos); + res = tmp.toInt(&ok, 16); + } else { + // '�', decimal character reference + TQString tmp(str.tqunicode()+pos, str.length()-pos); + res = tmp.toInt(&ok, 10); + } + return res; + } + + const entity *e = kde_findEntity(str.ascii(), str.length()); + + if(!e) + { + //kdDebug( 0 ) << "unknown entity " << str <<", len = " << str.length() << endl; + return TQChar::null; + } + //kdDebug() << "got entity " << str << " = " << e->code << endl; + + return TQChar(e->code); +} + +TQChar KCharsets::fromEntity(const TQString &str, int &len) +{ + // entities are never longer than 8 chars... we start from + // that length and work backwards... + len = 8; + while(len > 0) + { + TQString tmp = str.left(len); + TQChar res = fromEntity(tmp); + if( res != (QChar)TQChar::null ) return res; + len--; + } + return TQChar::null; +} + + +TQString KCharsets::toEntity(const TQChar &ch) +{ + TQString ent; + ent.sprintf("�x%x;", ch.tqunicode()); + return ent; +} + +TQString KCharsets::resolveEntities( const TQString &input ) +{ + TQString text = input; + const TQChar *p = text.tqunicode(); + const TQChar *end = p + text.length(); + const TQChar *ampersand = 0; + bool scanForSemicolon = false; + + for ( ; p < end; ++p ) { + const TQChar ch = *p; + + if ( ch == (QChar)'&' ) { + ampersand = p; + scanForSemicolon = true; + continue; + } + + if ( ch != (QChar)';' || scanForSemicolon == false ) + continue; + + assert( ampersand ); + + scanForSemicolon = false; + + const TQChar *entityBegin = ampersand + 1; + + const uint entityLength = p - entityBegin; + if ( entityLength == 0 ) + continue; + + const TQChar entityValue = KCharsets::fromEntity( TQConstString( entityBegin, entityLength ).string() ); + if ( entityValue.isNull() ) + continue; + + const uint ampersandPos = ampersand - text.tqunicode(); + + text[ (int)ampersandPos ] = entityValue; + text.remove( ampersandPos + 1, entityLength + 1 ); + p = text.tqunicode() + ampersandPos; + end = text.tqunicode() + text.length(); + ampersand = 0; + } + + return text; +} + +TQStringList KCharsets::availableEncodingNames() +{ + TQStringList available; + for ( const char* const* pos = charsets_for_encoding; *pos; ++pos ) { + //kdDebug(0) << *charsets << " available" << endl; + available.append( TQString::tqfromLatin1( *pos )); + } + return available; +} + +TQString KCharsets::languageForEncoding( const TQString &encoding ) +{ + int lang = kcharsets_array_search< LanguageForEncoding, int > + ( language_for_encoding, encoding.latin1()); + return i18n( language_names[lang] ); +} + +TQString KCharsets::encodingForName( const TQString &descriptiveName ) +{ + const int left = descriptiveName.findRev( '(' ); + + if (left<0) // No parenthesis, so assume it is a normal encoding name + return descriptiveName.stripWhiteSpace(); + + TQString name(descriptiveName.mid(left+1)); + + const int right = name.findRev( ')' ); + + if (right<0) + return name; + + return name.left(right).stripWhiteSpace(); +} + +TQStringList KCharsets::descriptiveEncodingNames() +{ + // As we are sorting, we can directly read the array language_for_encoding + TQStringList encodings; + for ( const LanguageForEncoding* pos = language_for_encoding; pos->index; ++pos ) { + const TQString name = TQString::tqfromLatin1( pos->index ); + const TQString description = i18n( language_names[ pos->data ] ); + encodings.append( i18n("Descriptive Encoding Name", "%1 ( %2 )"). arg ( description ). arg( name ) ); + } + encodings.sort(); + return encodings; +} + +TQTextCodec *KCharsets::codecForName(const TQString &n) const +{ + bool b; + return codecForName( n, b ); +} + +TQTextCodec *KCharsets::codecForName(const TQString &n, bool &ok) const +{ + ok = true; + + TQTextCodec* codec = 0; + // dict lookup is case insensitive anyway + if((codec = d->codecForNameDict[n.isEmpty() ? "->locale<-" : n.latin1()])) + return codec; // cache hit, return + + if (n.isEmpty()) { + codec = KGlobal::locale()->codecForEncoding(); + d->codecForNameDict.replace("->locale<-", codec); + return codec; + } + + TQCString name = n.lower().latin1(); + TQCString key = name; + if (name.right(8) == "_charset") + name.truncate(name.length()-8); + + if (name.isEmpty()) { + ok = false; + return TQTextCodec::codecForName("iso8859-1"); + } + + codec = TQTextCodec::codecForName(name); + + if(codec) { + d->codecForNameDict.replace(key, codec); + return codec; + } + + // these codecs are built into Qt, but the name given for the codec is different, + // so TQTextCodec did not recognize it. + TQCString cname = kcharsets_array_search< Builtin, const char* >( builtin, name.data()); + + if(!cname.isEmpty()) + codec = TQTextCodec::codecForName(cname); + + if(codec) + { + d->codecForNameDict.replace(key, codec); + return codec; + } + + TQString dir; + { + KConfigGroupSaver cfgsav( KGlobal::config(), "i18n" ); + dir = KGlobal::config()->readPathEntry("i18ndir", TQString::tqfromLatin1("/usr/share/i18n/charmaps")); + } + + // these are codecs not included in Qt. They can be build up if the corresponding charmap + // is available in the charmap directory. + cname = kcharsets_array_search< Aliases, const char* >( aliases, name.data()); + + if(cname.isEmpty()) + cname = name; + cname = cname.upper(); + + const TQString basicName = TQString::tqfromLatin1(cname); + kdDebug() << k_funcinfo << endl << " Trying to find " << cname << " in " << dir << endl; + + TQString charMapFileName; + bool gzipped = false; + TQDir qdir(dir); + if (!qdir.exists()) { + // The directory for the charmaps does not even exist... (That is common!) + } + else if (qdir.exists(basicName, false)) { + charMapFileName = basicName; + } + else if (qdir.exists(basicName+".gz", false)) { + charMapFileName = basicName + ".gz"; + gzipped = true; + } + else { + // Check if we are asking a code page + // If yes, then check "CP99999" and "IBM99999" + // First we need to find the number of the codepage + TQRegExp regexp("^(X-)?(CP|IBM)(-| )?(0-9)+"); + if ( regexp.search(basicName) != -1) { + const TQString num = regexp.cap(4); + if (num.isEmpty()) { + // No number, not a code page (or something went wrong) + } + else if (qdir.exists("IBM"+num)) { + charMapFileName = "IBM"+num; + } + else if (qdir.exists("IBM"+num+".gz")) { + charMapFileName = "IBM"+num+".gz"; + gzipped = true; + } + else if (qdir.exists("CP"+num)) { + charMapFileName = "CP"+num; + } + else if (qdir.exists("CP"+num+".gz")) { + charMapFileName = "CP"+num+".gz"; + gzipped = true; + } + } + } + + if (gzipped && !charMapFileName.isEmpty()) { + KQIODeviceGZip gzip(dir + "/" + charMapFileName); + if (gzip.open(IO_ReadOnly)) { + kdDebug() << "Loading gzipped charset..." << endl; + codec = TQTextCodec::loadCharmap(&gzip); + gzip.close(); + } + else + kdWarning() << "Could not open gzipped charset!" << endl; + } + else if (!charMapFileName.isEmpty()) { + codec = TQTextCodec::loadCharmapFile(dir + "/" + charMapFileName); + } + + if(codec) { + d->codecForNameDict.replace(key, codec); + return codec; + } + + // this also failed, the last resort is now to take some compatibility charmap + + cname = kcharsets_array_search< ConversionHints, const char* >( conversion_hints, (const char*)name.data() ); + + if(!cname.isEmpty()) + codec = TQTextCodec::codecForName(cname); + + if(codec) { + d->codecForNameDict.replace(key, codec); + return codec; + } + + // could not assign a codec, let's return Latin1 + ok = false; + return TQTextCodec::codecForName("iso8859-1"); +} |