/* This file was taken from the KDE 4.x libraries and backported to TQt 3. Copyright (C) 1999 Lars Knoll (knoll@kde.org) Copyright (C) 2003 Dirk Mueller (mueller@kde.org) Copyright (C) 2003 Apple Computer, Inc. Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net) This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ //---------------------------------------------------------------------------- // // decoder for input stream #include "encodingdetector.h" #undef DECODE_DEBUG //#define DECODE_DEBUG #define MAX_BUFFER 16*1024 #include <assert.h> #include <stdlib.h> #include "encodingdetector_ja_p.h" #include <tqregexp.h> #include <tqtextcodec.h> #include <tdeglobal.h> #include <kcharsets.h> #include <kdebug.h> #include <tdelocale.h> #include <ctype.h> // The following table was taken from libpango 1.19.3 and slightly modified. // Multiple scripts per language were removed and the entries were reordered so // that simple substring matching will work. For example, bam was put before ba // so that the first match will be likely the right match. Otherwise "ba" would // match "bam" but we would have to search on to find "bam" which is what we want. // The original file is called pango-script-lang-table.h /* pango-script-lang-table.h: * * Generated by gen-script-for-lang-new.c * Date: 2007-10-26 * Source: fontconfig-2.4.91 * * Do not edit. // I did. Sue me ;) */ typedef struct _PangoScriptForLang { const char lang[6]; EncodingDetector::AutoDetectScript scripts[1]; } PangoScriptForLang; //Unfortunately EncodingDetector does not know all scripts that Pango knows. //Also, using EncodingDetector::CentralEuropean for the appropriate countries //might give better results in some cases. //One especially important (many speakers/literates) omission is the lack of //Indian scripts. #define PANGO_SCRIPT_ARMENIAN EncodingDetector::None #define PANGO_SCRIPT_BENGALI EncodingDetector::None #define PANGO_SCRIPT_CANADIAN_ABORIGINAL EncodingDetector::None #define PANGO_SCRIPT_CHEROKEE EncodingDetector::None #define PANGO_SCRIPT_DEVANAGARI EncodingDetector::None #define PANGO_SCRIPT_ETHIOPIC EncodingDetector::None #define PANGO_SCRIPT_GUJARATI EncodingDetector::None #define PANGO_SCRIPT_GURMUKHI EncodingDetector::None #define PANGO_SCRIPT_KANNADA EncodingDetector::None #define PANGO_SCRIPT_KHMER EncodingDetector::None #define PANGO_SCRIPT_LAO EncodingDetector::None #define PANGO_SCRIPT_MALAYALAM EncodingDetector::None #define PANGO_SCRIPT_MONGOLIAN EncodingDetector::None #define PANGO_SCRIPT_MYANMAR EncodingDetector::None #define PANGO_SCRIPT_ORIYA EncodingDetector::None #define PANGO_SCRIPT_SINHALA EncodingDetector::None #define PANGO_SCRIPT_SYRIAC EncodingDetector::None #define PANGO_SCRIPT_TAGALOG EncodingDetector::None #define PANGO_SCRIPT_TAMIL EncodingDetector::None #define PANGO_SCRIPT_TIBETAN EncodingDetector::None #define PANGO_SCRIPT_TELUGU EncodingDetector::None //Instead of changing the table even more... #define PANGO_SCRIPT_ARABIC EncodingDetector::Arabic #define PANGO_SCRIPT_CYRILLIC EncodingDetector::Cyrillic #define PANGO_SCRIPT_GEORGIAN EncodingDetector::SouthEasternEurope #define PANGO_SCRIPT_GREEK EncodingDetector::Greek #define PANGO_SCRIPT_HEBREW EncodingDetector::Hebrew #define PANGO_SCRIPT_LATIN EncodingDetector::WesternEuropean #define PANGO_SCRIPT_THAI EncodingDetector::Thai static const PangoScriptForLang pango_script_for_lang[] = { { "aa", { PANGO_SCRIPT_LATIN/*62*/ } }, { "ab", { PANGO_SCRIPT_CYRILLIC/*90*/ } }, { "af", { PANGO_SCRIPT_LATIN/*69*/ } }, { "am", { PANGO_SCRIPT_ETHIOPIC/*218*/ } }, { "ar", { PANGO_SCRIPT_ARABIC/*125*/ } }, { "as", { PANGO_SCRIPT_BENGALI/*89*/ } }, { "ast", { PANGO_SCRIPT_LATIN/*66*/ } }, { "ava", { PANGO_SCRIPT_CYRILLIC/*67*/ } }, { "ay", { PANGO_SCRIPT_LATIN/*60*/ } }, { "az-ir", { PANGO_SCRIPT_ARABIC/*129*/ } }, { "az", { PANGO_SCRIPT_CYRILLIC/*80*/ } }, //, PANGO_SCRIPT_LATIN/*68*/ } }, { "bam", { PANGO_SCRIPT_LATIN/*60*/ } }, { "ba", { PANGO_SCRIPT_CYRILLIC/*82*/ } }, { "be", { PANGO_SCRIPT_CYRILLIC/*68*/ } }, { "bg", { PANGO_SCRIPT_CYRILLIC/*60*/ } }, { "bh", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, { "bho", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, { "bi", { PANGO_SCRIPT_LATIN/*58*/ } }, { "bin", { PANGO_SCRIPT_LATIN/*76*/ } }, { "bn", { PANGO_SCRIPT_BENGALI/*89*/ } }, { "bo", { PANGO_SCRIPT_TIBETAN/*95*/ } }, { "br", { PANGO_SCRIPT_LATIN/*64*/ } }, { "bs", { PANGO_SCRIPT_LATIN/*62*/ } }, { "bua", { PANGO_SCRIPT_CYRILLIC/*70*/ } }, { "ca", { PANGO_SCRIPT_LATIN/*74*/ } }, { "ce", { PANGO_SCRIPT_CYRILLIC/*67*/ } }, { "chm", { PANGO_SCRIPT_CYRILLIC/*76*/ } }, { "chr", { PANGO_SCRIPT_CHEROKEE/*85*/ } }, { "ch", { PANGO_SCRIPT_LATIN/*58*/ } }, { "co", { PANGO_SCRIPT_LATIN/*84*/ } }, { "cs", { PANGO_SCRIPT_LATIN/*82*/ } }, { "cu", { PANGO_SCRIPT_CYRILLIC/*103*/ } }, { "cv", { PANGO_SCRIPT_CYRILLIC/*72*/ } }, //, PANGO_SCRIPT_LATIN/*2*/ } }, { "cy", { PANGO_SCRIPT_LATIN/*78*/ } }, { "da", { PANGO_SCRIPT_LATIN/*70*/ } }, { "de", { PANGO_SCRIPT_LATIN/*59*/ } }, { "dz", { PANGO_SCRIPT_TIBETAN/*95*/ } }, { "el", { PANGO_SCRIPT_GREEK/*69*/ } }, { "en", { PANGO_SCRIPT_LATIN/*72*/ } }, { "eo", { PANGO_SCRIPT_LATIN/*64*/ } }, { "es", { PANGO_SCRIPT_LATIN/*66*/ } }, // { "et", { PANGO_SCRIPT_LATIN/*64*/ } }, { "et", { EncodingDetector::Baltic } }, { "eu", { PANGO_SCRIPT_LATIN/*56*/ } }, { "fa", { PANGO_SCRIPT_ARABIC/*129*/ } }, { "fi", { PANGO_SCRIPT_LATIN/*62*/ } }, { "fj", { PANGO_SCRIPT_LATIN/*52*/ } }, { "fo", { PANGO_SCRIPT_LATIN/*68*/ } }, { "fr", { PANGO_SCRIPT_LATIN/*84*/ } }, { "ful", { PANGO_SCRIPT_LATIN/*62*/ } }, { "fur", { PANGO_SCRIPT_LATIN/*66*/ } }, { "fy", { PANGO_SCRIPT_LATIN/*75*/ } }, { "ga", { PANGO_SCRIPT_LATIN/*80*/ } }, { "gd", { PANGO_SCRIPT_LATIN/*70*/ } }, { "gez", { PANGO_SCRIPT_ETHIOPIC/*218*/ } }, { "gl", { PANGO_SCRIPT_LATIN/*66*/ } }, { "gn", { PANGO_SCRIPT_LATIN/*70*/ } }, { "gu", { PANGO_SCRIPT_GUJARATI/*78*/ } }, { "gv", { PANGO_SCRIPT_LATIN/*54*/ } }, { "ha", { PANGO_SCRIPT_LATIN/*60*/ } }, { "haw", { PANGO_SCRIPT_LATIN/*62*/ } }, { "he", { PANGO_SCRIPT_HEBREW/*27*/ } }, { "hi", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, { "ho", { PANGO_SCRIPT_LATIN/*52*/ } }, { "hr", { PANGO_SCRIPT_LATIN/*62*/ } }, { "hu", { PANGO_SCRIPT_LATIN/*70*/ } }, { "hy", { PANGO_SCRIPT_ARMENIAN/*77*/ } }, { "ia", { PANGO_SCRIPT_LATIN/*52*/ } }, { "ibo", { PANGO_SCRIPT_LATIN/*58*/ } }, { "id", { PANGO_SCRIPT_LATIN/*54*/ } }, { "ie", { PANGO_SCRIPT_LATIN/*52*/ } }, { "ik", { PANGO_SCRIPT_CYRILLIC/*68*/ } }, { "io", { PANGO_SCRIPT_LATIN/*52*/ } }, { "is", { PANGO_SCRIPT_LATIN/*70*/ } }, { "it", { PANGO_SCRIPT_LATIN/*72*/ } }, { "iu", { PANGO_SCRIPT_CANADIAN_ABORIGINAL/*161*/ } }, // { "ja", { PANGO_SCRIPT_HAN/*6356*/, PANGO_SCRIPT_KATAKANA/*88*/, PANGO_SCRIPT_HIRAGANA/*85*/ } }, { "ja", { EncodingDetector::Japanese } }, { "kaa", { PANGO_SCRIPT_CYRILLIC/*78*/ } }, { "ka", { PANGO_SCRIPT_GEORGIAN/*33*/ } }, { "ki", { PANGO_SCRIPT_LATIN/*56*/ } }, { "kk", { PANGO_SCRIPT_CYRILLIC/*77*/ } }, { "kl", { PANGO_SCRIPT_LATIN/*81*/ } }, { "km", { PANGO_SCRIPT_KHMER/*70*/ } }, { "kn", { PANGO_SCRIPT_KANNADA/*80*/ } }, // { "ko", { PANGO_SCRIPT_HANGUL/*2443*/ } }, { "ko", { EncodingDetector::Korean } }, { "kok", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, { "ks", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, { "ku-ir", { PANGO_SCRIPT_ARABIC/*32*/ } }, { "ku", { PANGO_SCRIPT_CYRILLIC/*60*/ } }, //, PANGO_SCRIPT_LATIN/*4*/ } }, { "kum", { PANGO_SCRIPT_CYRILLIC/*66*/ } }, { "kv", { PANGO_SCRIPT_CYRILLIC/*70*/ } }, { "kw", { PANGO_SCRIPT_LATIN/*64*/ } }, { "ky", { PANGO_SCRIPT_CYRILLIC/*70*/ } }, { "la", { PANGO_SCRIPT_LATIN/*68*/ } }, { "lb", { PANGO_SCRIPT_LATIN/*75*/ } }, { "lez", { PANGO_SCRIPT_CYRILLIC/*67*/ } }, { "ln", { PANGO_SCRIPT_LATIN/*78*/ } }, { "lo", { PANGO_SCRIPT_LAO/*65*/ } }, // { "lt", { PANGO_SCRIPT_LATIN/*70*/ } }, { "lt", { EncodingDetector::Baltic } }, // { "lv", { PANGO_SCRIPT_LATIN/*78*/ } }, { "lv", { EncodingDetector::Baltic } }, { "mg", { PANGO_SCRIPT_LATIN/*56*/ } }, { "mh", { PANGO_SCRIPT_LATIN/*62*/ } }, { "mi", { PANGO_SCRIPT_LATIN/*64*/ } }, { "mk", { PANGO_SCRIPT_CYRILLIC/*42*/ } }, { "ml", { PANGO_SCRIPT_MALAYALAM/*78*/ } }, { "mn", { PANGO_SCRIPT_MONGOLIAN/*130*/ } }, { "mo", { PANGO_SCRIPT_CYRILLIC/*66*/ } }, //, PANGO_SCRIPT_LATIN/*62*/ } }, { "mr", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, { "mt", { PANGO_SCRIPT_LATIN/*72*/ } }, { "my", { PANGO_SCRIPT_MYANMAR/*48*/ } }, { "nb", { PANGO_SCRIPT_LATIN/*70*/ } }, { "nds", { PANGO_SCRIPT_LATIN/*59*/ } }, { "ne", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, { "nl", { PANGO_SCRIPT_LATIN/*82*/ } }, { "nn", { PANGO_SCRIPT_LATIN/*76*/ } }, { "no", { PANGO_SCRIPT_LATIN/*70*/ } }, { "nr", { PANGO_SCRIPT_LATIN/*52*/ } }, { "nso", { PANGO_SCRIPT_LATIN/*58*/ } }, { "ny", { PANGO_SCRIPT_LATIN/*54*/ } }, { "oc", { PANGO_SCRIPT_LATIN/*70*/ } }, { "om", { PANGO_SCRIPT_LATIN/*52*/ } }, { "or", { PANGO_SCRIPT_ORIYA/*79*/ } }, { "os", { PANGO_SCRIPT_CYRILLIC/*66*/ } }, { "pa", { PANGO_SCRIPT_GURMUKHI/*63*/ } }, { "pl", { PANGO_SCRIPT_LATIN/*70*/ } }, { "ps-af", { PANGO_SCRIPT_ARABIC/*49*/ } }, { "ps-pk", { PANGO_SCRIPT_ARABIC/*49*/ } }, { "pt", { PANGO_SCRIPT_LATIN/*82*/ } }, { "rm", { PANGO_SCRIPT_LATIN/*66*/ } }, { "ro", { PANGO_SCRIPT_LATIN/*62*/ } }, { "ru", { PANGO_SCRIPT_CYRILLIC/*66*/ } }, { "sah", { PANGO_SCRIPT_CYRILLIC/*76*/ } }, { "sa", { PANGO_SCRIPT_DEVANAGARI/*68*/ } }, { "sco", { PANGO_SCRIPT_LATIN/*56*/ } }, { "sel", { PANGO_SCRIPT_CYRILLIC/*66*/ } }, { "se", { PANGO_SCRIPT_LATIN/*66*/ } }, { "sh", { PANGO_SCRIPT_CYRILLIC/*76*/ } }, { "si", { PANGO_SCRIPT_SINHALA/*77*/ } }, { "sk", { PANGO_SCRIPT_LATIN/*86*/ } }, { "sl", { PANGO_SCRIPT_LATIN/*62*/ } }, { "sma", { PANGO_SCRIPT_LATIN/*60*/ } }, { "smj", { PANGO_SCRIPT_LATIN/*60*/ } }, { "smn", { PANGO_SCRIPT_LATIN/*68*/ } }, { "sms", { PANGO_SCRIPT_LATIN/*80*/ } }, { "sm", { PANGO_SCRIPT_LATIN/*52*/ } }, { "so", { PANGO_SCRIPT_LATIN/*52*/ } }, { "sq", { PANGO_SCRIPT_LATIN/*56*/ } }, { "sr", { PANGO_SCRIPT_CYRILLIC/*76*/ } }, { "ss", { PANGO_SCRIPT_LATIN/*52*/ } }, { "st", { PANGO_SCRIPT_LATIN/*52*/ } }, { "sv", { PANGO_SCRIPT_LATIN/*68*/ } }, { "sw", { PANGO_SCRIPT_LATIN/*52*/ } }, { "syr", { PANGO_SCRIPT_SYRIAC/*45*/ } }, { "ta", { PANGO_SCRIPT_TAMIL/*48*/ } }, { "te", { PANGO_SCRIPT_TELUGU/*80*/ } }, { "tg", { PANGO_SCRIPT_CYRILLIC/*78*/ } }, { "th", { PANGO_SCRIPT_THAI/*86*/ } }, { "ti-er", { PANGO_SCRIPT_ETHIOPIC/*255*/ } }, { "ti-et", { PANGO_SCRIPT_ETHIOPIC/*255*/ } }, { "tig", { PANGO_SCRIPT_ETHIOPIC/*221*/ } }, { "tk", { PANGO_SCRIPT_CYRILLIC/*74*/ } }, { "tl", { PANGO_SCRIPT_TAGALOG/*19*/ } }, { "tn", { PANGO_SCRIPT_LATIN/*58*/ } }, { "to", { PANGO_SCRIPT_LATIN/*52*/ } }, // { "tr", { PANGO_SCRIPT_LATIN/*70*/ } }, { "tr", { EncodingDetector::Turkish } }, { "ts", { PANGO_SCRIPT_LATIN/*52*/ } }, { "tt", { PANGO_SCRIPT_CYRILLIC/*76*/ } }, { "tw", { PANGO_SCRIPT_LATIN/*70*/ } }, { "tyv", { PANGO_SCRIPT_CYRILLIC/*70*/ } }, { "ug", { PANGO_SCRIPT_ARABIC/*125*/ } }, { "uk", { PANGO_SCRIPT_CYRILLIC/*72*/ } }, { "ur", { PANGO_SCRIPT_ARABIC/*145*/ } }, { "uz", { PANGO_SCRIPT_CYRILLIC/*68*/ } }, { "ven", { PANGO_SCRIPT_LATIN/*62*/ } }, { "vi", { PANGO_SCRIPT_LATIN/*186*/ } }, { "vot", { PANGO_SCRIPT_LATIN/*62*/ } }, { "vo", { PANGO_SCRIPT_LATIN/*54*/ } }, { "wa", { PANGO_SCRIPT_LATIN/*70*/ } }, { "wen", { PANGO_SCRIPT_LATIN/*76*/ } }, { "wo", { PANGO_SCRIPT_LATIN/*66*/ } }, { "xh", { PANGO_SCRIPT_LATIN/*52*/ } }, { "yap", { PANGO_SCRIPT_LATIN/*58*/ } }, { "yi", { PANGO_SCRIPT_HEBREW/*27*/ } }, { "yo", { PANGO_SCRIPT_LATIN/*114*/ } }, // { "zh-cn", { PANGO_SCRIPT_HAN/*6763*/ } }, { "zh-cn", { EncodingDetector::ChineseSimplified } }, // { "zh-hk", { PANGO_SCRIPT_HAN/*2213*/ } }, { "zh-hk", { EncodingDetector::ChineseTraditional } }, // { "zh-mo", { PANGO_SCRIPT_HAN/*2213*/ } }, { "zh-mo", { EncodingDetector::ChineseTraditional } }, // { "zh-sg", { PANGO_SCRIPT_HAN/*6763*/ } }, { "zh-sg", { EncodingDetector::ChineseSimplified } }, // { "zh-tw", { PANGO_SCRIPT_HAN/*13063*/ } }, { "zh-tw", { EncodingDetector::ChineseTraditional } }, { "zu", { PANGO_SCRIPT_LATIN/*52*/ } }, { "\x00", { EncodingDetector::None } } //end mark }; enum MIB { MibLatin1 = 4, Mib8859_8 = 85, MibUtf8 = 106, MibUcs2 = 1000, MibUtf16 = 1015, MibUtf16BE = 1013, MibUtf16LE = 1014 }; static bool is16Bit(TQTextCodec* codec) { switch (codec->mibEnum()) { case MibUtf16: case MibUtf16BE: case MibUtf16LE: case MibUcs2: return true; default: return false; } } class EncodingDetectorPrivate { public: TQTextCodec *m_codec; TQTextDecoder *m_decoder; // utf16 TQTextCodec *m_defaultCodec; TQCString m_storeDecoderName; EncodingDetector::EncodingChoiceSource m_source; EncodingDetector::AutoDetectScript m_autoDetectLanguage; bool m_visualRTL : 1; bool m_seenBody : 1; bool m_writtingHappened : 1; bool m_analyzeCalled : 1; //for decode() int m_multiByte; TQCString m_bufferForDefferedEncDetection; EncodingDetectorPrivate() : m_codec(TQTextCodec::codecForMib(MibLatin1)) , m_decoder(m_codec->makeDecoder()) , m_defaultCodec(m_codec) , m_source(EncodingDetector::DefaultEncoding) , m_autoDetectLanguage(EncodingDetector::SemiautomaticDetection) , m_visualRTL(false) , m_seenBody(false) , m_writtingHappened(false) , m_analyzeCalled(false) , m_multiByte(0) { } EncodingDetectorPrivate(TQTextCodec* codec,EncodingDetector::EncodingChoiceSource source, EncodingDetector::AutoDetectScript script) : m_codec(codec) , m_decoder(m_codec->makeDecoder()) , m_defaultCodec(m_codec) , m_source(source) , m_autoDetectLanguage(script) , m_visualRTL(false) , m_seenBody(false) , m_writtingHappened(false) , m_analyzeCalled(false) , m_multiByte(0) { } ~EncodingDetectorPrivate() { delete m_decoder; } }; static TQCString automaticDetectionForArabic( const unsigned char* ptr, int size ) { for ( int i = 0; i < size; ++i ) { if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3 || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA ) || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0 || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) { return "cp1256"; } } return "iso-8859-6"; } static TQCString automaticDetectionForBaltic( const unsigned char* ptr, int size ) { for ( int i = 0; i < size; ++i ) { if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) ) return "cp1257"; if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 ) return "iso-8859-13"; } return "iso-8859-13"; } static TQCString automaticDetectionForCentralEuropean(const unsigned char* ptr, int size ) { TQCString charset; for ( int i = 0; i < size; ++i ) { if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) { if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 ) return "ibm852"; if ( i + 1 > size ) return "cp1250"; else { // maybe ibm852 ? charset = "cp1250"; continue; } } if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) { if ( i + 1 > size ) return "iso-8859-2"; else { // maybe ibm852 ? if ( charset.isNull() ) charset = "iso-8859-2"; continue; } } } if ( charset.isNull() ) charset = "iso-8859-3"; return charset.data(); } static TQCString automaticDetectionForCyrillic( const unsigned char* ptr, int size) { #ifdef DECODE_DEBUG kWarning() << "EncodingDetector: Cyr heuristics"; #endif // if (ptr[0]==0xef && ptr[1]==0xbb && ptr[2]==0xbf) // return "utf8"; int utf8_mark=0; int koi_score=0; int cp1251_score=0; int koi_st=0; int cp1251_st=0; // int koi_na=0; // int cp1251_na=0; int koi_o_capital=0; int koi_o=0; int cp1251_o_capital=0; int cp1251_o=0; int koi_a_capital=0; int koi_a=0; int cp1251_a_capital=0; int cp1251_a=0; int koi_s_capital=0; int koi_s=0; int cp1251_s_capital=0; int cp1251_s=0; int koi_i_capital=0; int koi_i=0; int cp1251_i_capital=0; int cp1251_i=0; int cp1251_small_range=0; int koi_small_range=0; int ibm866_small_range=0; int i; for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i) { if (ptr[i]>0xdf) { ++cp1251_small_range; if (ptr[i]==0xee)//small o ++cp1251_o; else if (ptr[i]==0xe0)//small a ++cp1251_a; else if (ptr[i]==0xe8)//small i ++cp1251_i; else if (ptr[i]==0xf1)//small s ++cp1251_s; else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)//small st ++cp1251_st; else if (ptr[i]==0xef) ++koi_o_capital; else if (ptr[i]==0xe1) ++koi_a_capital; else if (ptr[i]==0xe9) ++koi_i_capital; else if (ptr[i]==0xf3) ++koi_s_capital; } else if (ptr[i]>0xbf) { ++koi_small_range; if (ptr[i]==0xd0||ptr[i]==0xd1)//small o ++utf8_mark; else if (ptr[i]==0xcf)//small o ++koi_o; else if (ptr[i]==0xc1)//small a ++koi_a; else if (ptr[i]==0xc9)//small i ++koi_i; else if (ptr[i]==0xd3)//small s ++koi_s; else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)//small st ++koi_st; else if (ptr[i]==0xce) ++cp1251_o_capital; else if (ptr[i]==0xc0) ++cp1251_a_capital; else if (ptr[i]==0xc8) ++cp1251_i_capital; else if (ptr[i]==0xd1) ++cp1251_s_capital; } else if (ptr[i]>0x9f && ptr[i]<0xb0) //first 16 letterz is 60% ++ibm866_small_range; } //cannot decide? if (cp1251_small_range+koi_small_range+ibm866_small_range<8) { return ""; } if (3*utf8_mark>cp1251_small_range+koi_small_range+ibm866_small_range) { #ifdef DECODE_DEBUG kWarning() << "Cyr Enc Detection: UTF8"; #endif return "UTF-8"; } if (ibm866_small_range>cp1251_small_range+koi_small_range) return "ibm866"; // TQCString koi_string = "koi8-u"; // TQCString cp1251_string = "cp1251"; if (cp1251_st==0 && koi_st>1) koi_score+=10; else if (koi_st==0 && cp1251_st>1) cp1251_score+=10; if (cp1251_st && koi_st) { if (cp1251_st/koi_st>2) cp1251_score+=20; else if (koi_st/cp1251_st>2) koi_score+=20; } if (cp1251_a>koi_a) cp1251_score+=10; else if (cp1251_a || koi_a) koi_score+=10; if (cp1251_o>koi_o) cp1251_score+=10; else if (cp1251_o || koi_o) koi_score+=10; if (cp1251_i>koi_i) cp1251_score+=10; else if (cp1251_i || koi_i) koi_score+=10; if (cp1251_s>koi_s) cp1251_score+=10; else if (cp1251_s || koi_s) koi_score+=10; if (cp1251_a_capital>koi_a_capital) cp1251_score+=9; else if (cp1251_a_capital || koi_a_capital) koi_score+=9; if (cp1251_o_capital>koi_o_capital) cp1251_score+=9; else if (cp1251_o_capital || koi_o_capital) koi_score+=9; if (cp1251_i_capital>koi_i_capital) cp1251_score+=9; else if (cp1251_i_capital || koi_i_capital) koi_score+=9; if (cp1251_s_capital>koi_s_capital) cp1251_score+=9; else if (cp1251_s_capital || koi_s_capital) koi_score+=9; #ifdef DECODE_DEBUG kWarning()<<"koi_score " << koi_score << " cp1251_score " << cp1251_score; #endif if (abs(koi_score-cp1251_score)<10) { //fallback... cp1251_score=cp1251_small_range; koi_score=koi_small_range; } if (cp1251_score>koi_score) return "cp1251"; else return "koi8-u"; // if (cp1251_score>koi_score) // setEncoding("cp1251",AutoDetectedEncoding); // else // setEncoding("koi8-u",AutoDetectedEncoding); // return true; } static TQCString automaticDetectionForGreek( const unsigned char* ptr, int size ) { for ( int i = 0; i < size; ++i ) { if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4 || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) { return "cp1253"; } } return "iso-8859-7"; } static TQCString automaticDetectionForHebrew( const unsigned char* ptr, int size ) { for ( int i = 0; i < size; ++i ) { if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 ) || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) { return "cp1255"; } if ( ptr[ i ] == 0xDF ) return "iso-8859-8-i"; } return "iso-8859-8-i"; } static TQCString automaticDetectionForJapanese( const unsigned char* ptr, int size ) { JapaneseCode kc; switch ( kc.guess_jp( (const char*)ptr, size ) ) { case JapaneseCode::JIS: return "jis7"; case JapaneseCode::EUC: return "eucjp"; case JapaneseCode::SJIS: return "sjis"; case JapaneseCode::UTF8: return "utf8"; default: break; } return ""; } static TQCString automaticDetectionForTurkish( const unsigned char* ptr, int size ) { for ( int i = 0; i < size; ++i ) { if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) { return "cp1254"; } } return "iso-8859-9"; } static TQCString automaticDetectionForWesternEuropean( const unsigned char* ptr, int size ) { uint nonansi_count=0; for (int i=0; i<size; ++i) { if (ptr[i]>0x79) { ++nonansi_count; if ( ptr[i]>0xc1 && ptr[i]<0xf0 && i+1<size && ptr[i+1]>0x7f && ptr[i+1]<0xc0) { return "UTF-8"; } if (ptr[i] >= 0x78 && ptr[i] <= 0x9 ) { return "cp1252"; } } } if (nonansi_count>0) return "iso-8859-15"; return ""; } bool EncodingDetector::errorsIfUtf8 (const char* data, int length) { if (d->m_codec->mibEnum()!=MibUtf8) return false; //means no errors // #define highest1Bits (unsigned char)0x80 // #define highest2Bits (unsigned char)0xC0 // #define highest3Bits (unsigned char)0xE0 // #define highest4Bits (unsigned char)0xF0 // #define highest5Bits (unsigned char)0xF8 static const unsigned char highest1Bits = 0x80; static const unsigned char highest2Bits = 0xC0; static const unsigned char highest3Bits = 0xE0; static const unsigned char highest4Bits = 0xF0; static const unsigned char highest5Bits = 0xF8; for (int i=0; i<length; ++i) { unsigned char c = data[i]; if (d->m_multiByte>0) { if ((c & highest2Bits) == 0x80) { --(d->m_multiByte); continue; } #ifdef DECODE_DEBUG kWarning() << "EncDetector: Broken UTF8"; #endif return true; } // most significant bit zero, single char if ((c & highest1Bits) == 0x00) continue; // 110xxxxx => init 1 following bytes if ((c & highest3Bits) == 0xC0) { d->m_multiByte = 1; continue; } // 1110xxxx => init 2 following bytes if ((c & highest4Bits) == 0xE0) { d->m_multiByte = 2; continue; } // 11110xxx => init 3 following bytes if ((c & highest5Bits) == 0xF0) { d->m_multiByte = 3; continue; } #ifdef DECODE_DEBUG kWarning() << "EncDetector:_Broken UTF8"; #endif return true; } return false; } EncodingDetector::EncodingDetector() : d(new EncodingDetectorPrivate) { } EncodingDetector::EncodingDetector(TQTextCodec* codec, EncodingChoiceSource source, AutoDetectScript script) : d(new EncodingDetectorPrivate(codec,source,script)) { } EncodingDetector::~EncodingDetector() { delete d; } void EncodingDetector::setAutoDetectLanguage( EncodingDetector::AutoDetectScript lang) { d->m_autoDetectLanguage=lang; } EncodingDetector::AutoDetectScript EncodingDetector::autoDetectLanguage() const { return d->m_autoDetectLanguage; } EncodingDetector::EncodingChoiceSource EncodingDetector::encodingChoiceSource() const { return d->m_source; } const char* EncodingDetector::encoding() const { d->m_storeDecoderName = d->m_codec->name(); d->m_storeDecoderName = d->m_storeDecoderName.lower().replace( "iso ", "iso-" ); return d->m_storeDecoderName.data(); } bool EncodingDetector::visuallyOrdered() const { return d->m_visualRTL; } // const TQTextCodec* EncodingDetector::codec() const // { // return d->m_codec; // } TQTextDecoder* EncodingDetector::decoder() { return d->m_decoder; } bool EncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type) { TQTextCodec *codec; TQCString enc(_encoding); if(/*enc.isNull() || */enc.isEmpty()) { if (type==DefaultEncoding) codec=d->m_defaultCodec; else return false; } else { //TQString->TQTextCodec enc = enc.lower(); // hebrew visually ordered if(enc=="visual") enc="iso8859-8"; bool b; codec = TDEGlobal::charsets()->codecForName(enc, b); if (!b) return false; } if (d->m_codec->mibEnum()==codec->mibEnum()) return true; if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec)) { //Sometimes the codec specified is absurd, i.e. UTF-16 despite //us decoding a meta tag as ASCII. In that case, ignore it. return false; } if (codec->mibEnum() == Mib8859_8) { //We do NOT want to use TQt's TQHebrewCodec, since it tries to reorder itself. codec = TQTextCodec::codecForName("iso8859-8-i"); // visually ordered unless one of the following if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical")) d->m_visualRTL = true; } d->m_codec = codec; d->m_source = type; delete d->m_decoder; d->m_decoder = d->m_codec->makeDecoder(); #ifdef DECODE_DEBUG kDebug(6005) << "EncodingDetector::encoding used is" << d->m_codec->name(); #endif return true; } bool EncodingDetector::analyze(const TQByteArray &data) { return analyze( data.data(), data.size() ); } bool EncodingDetector::analyze(const char *data, int len) { // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding. // maximumBOMLength = 10 // Even if the user has chosen utf16 we still need to auto-detect the endianness if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec))) { // Extract the first three bytes. const uchar *udata = (const uchar *)data; uchar c1 = *udata++; uchar c2 = *udata++; uchar c3 = *udata++; // Check for the BOM const char *autoDetectedEncoding; if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) { autoDetectedEncoding = "ISO-10646-UCS-2"; } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { autoDetectedEncoding = "UTF-8"; } else if (c1 == 0x00 || c2 == 0x00) { uchar c4 = *udata++; uchar c5 = *udata++; uchar c6 = *udata++; uchar c7 = *udata++; uchar c8 = *udata++; uchar c9 = *udata++; uchar c10 = *udata++; int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0); int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0); if ((nul_count_even==0 && nul_count_odd==5) || (nul_count_even==5 && nul_count_odd==0)) autoDetectedEncoding = "ISO-10646-UCS-2"; else autoDetectedEncoding = 0; } else { autoDetectedEncoding = 0; } // If we found a BOM, use the encoding it implies. if (autoDetectedEncoding != 0) { d->m_source = BOM; d->m_codec = TQTextCodec::codecForName(autoDetectedEncoding); assert(d->m_codec); //enc = d->m_codec->name(); delete d->m_decoder; d->m_decoder = d->m_codec->makeDecoder(); #ifdef DECODE_DEBUG kWarning() << "Detection by BOM"; #endif if (is16Bit(d->m_codec) && c2==0x00) { // utf16LE, we need to put the decoder in LE mode char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00}; d->m_decoder->toUnicode(reverseUtf16, 2); } return true; } } //exit from routine in case it was called to only detect byte order for utf-16 if (d->m_source==UserChosenEncoding) { #ifdef DECODE_DEBUG kWarning() << "EncodingDetector: UserChosenEncoding exit "; #endif if (errorsIfUtf8(data, len)) setEncoding("",DefaultEncoding); return true; } #if 0 //This is for plaintext, so don't try to parse HTML headers -- ahartmetz if (!d->m_seenBody) { // we still don't have an encoding, and are in the head // the following tags are allowed in <head>: // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE const char *ptr = data; const char *pEnd = data+len; while(ptr != pEnd) { if(*ptr!='<') { ++ptr; continue; } ++ptr; // Handle comments. if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') { ptr += 3; skipComment(ptr, pEnd); continue; } // Handle XML header, which can have encoding in it. if (ptr[0]=='?' && ptr[1]=='x' && ptr[2]=='m' && ptr[3]=='l') { const char *end = ptr; while (*end != '>' && end < pEnd) end++; if (*end == '\0' || end == pEnd) break; TQCString str(ptr, end - ptr + 1); int length; int pos = findXMLEncoding(str, length); // also handles the case when specified encoding aint correct if (pos!=-1 && setEncoding(str.mid(pos, length), EncodingFromXMLHeader)) { return true; } } //look for <meta>, stop if we reach <body> while ( !((*ptr >= 'a') && (*ptr <= 'z') || (*ptr >= 'A') && (*ptr <= 'Z')) && ptr < pEnd ) ++ptr; char tmp[5]; int length=0; const char* max=ptr+4; if (pEnd<max) max=pEnd; while ( ((*ptr >= 'a') && (*ptr <= 'z') || (*ptr >= 'A') && (*ptr <= 'Z') || (*ptr >= '0') && (*ptr <= '9')) && ptr < max ) { tmp[length] = tolower( *ptr ); ++ptr; ++length; } tmp[length] = 0; if (tmp[0]=='m'&&tmp[1]=='e'&&tmp[2]=='t'&&tmp[3]=='a') { // found a meta tag... const char* end = ptr; while(*end != '>' && *end != '\0' && end<pEnd) end++; //if ( *end == '\0' ) break; TQCString str( ptr, (end-ptr)+1); str = str.lower(); int pos=0; //if( (pos = str.find("http-equiv", pos)) == -1) break; //if( (pos = str.find("content-type", pos)) == -1) break; if( (pos = str.find("charset")) == -1) continue; pos+=6; // skip to '=' if( (pos = str.find('=', pos)) == -1) continue; // skip whitespace before encoding itself while (pos < (int)str.length() && str[pos] <= ' ') ++pos; if ( pos == (int)str.length()) continue; int endpos = pos; while( endpos < str.length() && (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\'' && str[endpos] != ';' && str[endpos] != '>') ) ++endpos; #ifdef DECODE_DEBUG kDebug( 6005 ) << "EncodingDetector: found charset in <meta>: " << str.mid(pos,endpos-pos).data(); #endif if (setEncoding(str.mid(pos,endpos-pos), EncodingFromMetaTag)) return true; } else if (tmp[0]=='b'&&tmp[1]=='o'&&tmp[2]=='d'&&tmp[3]=='y') { d->m_seenBody=true; break; } } } if (d->m_source==EncodingFromHTTPHeader) return true; #endif //if (len<20) //make a guess even if the file is short -- ahartmetz if (len < 1) { setEncoding("",DefaultEncoding); return false; } #ifdef DECODE_DEBUG kDebug( 6005 ) << "EncodingDetector: using heuristics (" << strlen(data) << ")"; #endif switch ( d->m_autoDetectLanguage ) { case EncodingDetector::Arabic: return setEncoding(automaticDetectionForArabic( (const unsigned char*) data, len ), AutoDetectedEncoding); // break; case EncodingDetector::Baltic: return setEncoding(automaticDetectionForBaltic( (const unsigned char*) data, len ), AutoDetectedEncoding); // break; case EncodingDetector::CentralEuropean: return setEncoding(automaticDetectionForCentralEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding); break; case EncodingDetector::Cyrillic: return setEncoding(automaticDetectionForCyrillic( (const unsigned char*) data, len), AutoDetectedEncoding); // break; case EncodingDetector::Greek: return setEncoding(automaticDetectionForGreek( (const unsigned char*) data, len ), AutoDetectedEncoding); // break; case EncodingDetector::Hebrew: return setEncoding(automaticDetectionForHebrew( (const unsigned char*) data, len ), AutoDetectedEncoding); // break; case EncodingDetector::Japanese: return setEncoding(automaticDetectionForJapanese( (const unsigned char*) data, len ), AutoDetectedEncoding); // break; case EncodingDetector::Turkish: return setEncoding(automaticDetectionForTurkish( (const unsigned char*) data, len ), AutoDetectedEncoding); // break; case EncodingDetector::WesternEuropean: if (setEncoding(automaticDetectionForWesternEuropean( (const unsigned char*) data, len ), AutoDetectedEncoding)) return true; else if (d->m_defaultCodec->mibEnum()==MibLatin1) //detection for tdehtml { return setEncoding("iso-8859-15",AutoDetectedEncoding); } else //use default provided by eg katepart { return setEncoding("",DefaultEncoding); } // break; case EncodingDetector::SemiautomaticDetection: case EncodingDetector::ChineseSimplified: case EncodingDetector::ChineseTraditional: case EncodingDetector::Korean: case EncodingDetector::Thai: case EncodingDetector::Unicode: case EncodingDetector::NorthernSaami: case EncodingDetector::SouthEasternEurope: case EncodingDetector::None: // huh. somethings broken in this code ### FIXME //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback. break; } setEncoding("",DefaultEncoding); return true; } EncodingDetector::AutoDetectScript EncodingDetector::scriptForName(const TQString& lang) { if (lang.isEmpty()) return EncodingDetector::None; else if (lang==i18n("@item Text character set", "Unicode")) return EncodingDetector::Unicode; else if (lang==i18n("@item Text character set", "Cyrillic")) return EncodingDetector::Cyrillic; else if (lang==i18n("@item Text character set", "Western European")) return EncodingDetector::WesternEuropean; else if (lang==i18n("@item Text character set", "Central European")) return EncodingDetector::CentralEuropean; else if (lang==i18n("@item Text character set", "Greek")) return EncodingDetector::Greek; else if (lang==i18n("@item Text character set", "Hebrew")) return EncodingDetector::Hebrew; else if (lang==i18n("@item Text character set", "Turkish")) return EncodingDetector::Turkish; else if (lang==i18n("@item Text character set", "Japanese")) return EncodingDetector::Japanese; else if (lang==i18n("@item Text character set", "Baltic")) return EncodingDetector::Baltic; else if (lang==i18n("@item Text character set", "Arabic")) return EncodingDetector::Arabic; return EncodingDetector::None; } bool EncodingDetector::hasAutoDetectionForScript(EncodingDetector::AutoDetectScript script) { switch (script) { case EncodingDetector::Arabic: return true; case EncodingDetector::Baltic: return true; case EncodingDetector::CentralEuropean: return true; case EncodingDetector::Cyrillic: return true; case EncodingDetector::Greek: return true; case EncodingDetector::Hebrew: return true; case EncodingDetector::Japanese: return true; case EncodingDetector::Turkish: return true; case EncodingDetector::WesternEuropean: return true; case EncodingDetector::ChineseTraditional: return true; case EncodingDetector::ChineseSimplified: return true; case EncodingDetector::Unicode: return true; break; default: return false; } } TQString EncodingDetector::nameForScript(EncodingDetector::AutoDetectScript script) { switch (script) { case EncodingDetector::Arabic: return i18n("@item Text character set", "Arabic"); break; case EncodingDetector::Baltic: return i18n("@item Text character set", "Baltic"); break; case EncodingDetector::CentralEuropean: return i18n("@item Text character set", "Central European"); break; case EncodingDetector::Cyrillic: return i18n("@item Text character set", "Cyrillic"); break; case EncodingDetector::Greek: return i18n("@item Text character set", "Greek"); break; case EncodingDetector::Hebrew: return i18n("@item Text character set", "Hebrew"); break; case EncodingDetector::Japanese: return i18n("@item Text character set", "Japanese"); break; case EncodingDetector::Turkish: return i18n("@item Text character set", "Turkish"); break; case EncodingDetector::WesternEuropean: return i18n("@item Text character set", "Western European"); break; case EncodingDetector::ChineseTraditional: return i18n("@item Text character set", "Chinese Traditional"); break; case EncodingDetector::ChineseSimplified: return i18n("@item Text character set", "Chinese Simplified"); break; case EncodingDetector::Korean: return i18n("@item Text character set", "Korean"); break; case EncodingDetector::Thai: return i18n("@item Text character set", "Thai"); break; case EncodingDetector::Unicode: return i18n("@item Text character set", "Unicode"); break; //case EncodingDetector::SemiautomaticDetection: default: return TQString(); } } EncodingDetector::AutoDetectScript EncodingDetector::scriptForLanguageCode(const TQString &lc) { // It might make sense to do something special if the locale ends with // ".UTF-8" or "@utf8" const char *langStr = pango_script_for_lang[0].lang; // There is obvious optimization potential... for ( int i = 0; langStr; i++ ) { langStr = pango_script_for_lang[i].lang; // startsWith() works for empty strings: every string "starts with" an empty string. if ( lc.startsWith( TQString::fromAscii( langStr ) ) ) return pango_script_for_lang[i].scripts[0]; } return None; } #undef DECODE_DEBUG