1 files changed, 611 insertions, 0 deletions
diff --git a/src/codecs/tqhebrewcodec.cpp b/src/codecs/tqhebrewcodec.cpp
new file mode 100644
index 000000000..0140117b0
--- /dev/null
+++ b/src/codecs/tqhebrewcodec.cpp
@@ -0,0 +1,611 @@
+/****************************************************************************
+**
+** Implementation of TQTextCodec class
+**
+** Created : 981015
+**
+** Copyright (C) 1998-2008 Trolltech ASA.  All rights reserved.
+**
+** This file is part of the tools module of the TQt GUI Toolkit.
+**
+** This file may be used under the terms of the GNU General
+** Public License versions 2.0 or 3.0 as published by the Free
+** Software Foundation and appearing in the files LICENSE.GPL2
+** and LICENSE.GPL3 included in the packaging of this file.
+** Alternatively you may (at your option) use any later version
+** of the GNU General Public License if such license has been
+** publicly approved by Trolltech ASA (or its successors, if any)
+** and the KDE Free TQt Foundation.
+**
+** Please review the following information to ensure GNU General
+** Public Licensing requirements will be met:
+** http://trolltech.com/products/qt/licenses/licensing/opensource/.
+** If you are unsure which license is appropriate for your use, please
+** review the following information:
+** http://trolltech.com/products/qt/licenses/licensing/licensingoverview
+** or contact the sales department at [email protected].
+**
+** This file may be used under the terms of the Q Public License as
+** defined by Trolltech ASA and appearing in the file LICENSE.TQPL
+** included in the packaging of this file.  Licensees holding valid TQt
+** Commercial licenses may use this file in accordance with the TQt
+** Commercial License Agreement provided with the Software.
+**
+** This file is provided "AS IS" with NO WARRANTY OF ANY KIND,
+** INCLUDING THE WARRANTIES OF DESIGN, MERCHANTABILITY AND FITNESS FOR
+** A PARTICULAR PURPOSE. Trolltech reserves all rights not granted
+** herein.
+**
+**********************************************************************/
+
+#include "tqhebrewcodec.h"
+#include <private/tqtextengine_p.h>
+
+#ifndef TQT_NO_CODEC_HEBREW
+
+// NOT REVISED
+
+static const uchar unkn = '?'; // byte emitted for characters without an 8859-8 form; BLACK SQUARE (94) would be better
+
+static const ushort heb_to_unicode[128] = { // Unicode value for byte (0x80 + index); 0xFFFD = byte has no mapping
+    0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, // 0x80
+    0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, // 0x88
+    0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, // 0x90
+    0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, // 0x98
+    0x00A0, 0xFFFD, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, // 0xA0
+    0x00A8, 0x00A9, 0x00D7, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x203E, // 0xA8
+    0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, // 0xB0
+    0x00B8, 0x00B9, 0x00F7, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0xFFFD, // 0xB8
+    0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, // 0xC0
+    0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, // 0xC8
+    0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, // 0xD0
+    0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x2017, // 0xD8
+    0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7, // 0xE0
+    0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF, // 0xE8
+    0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7, // 0xF0
+    0x05E8, 0x05E9, 0x05EA, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD  // 0xF8
+};
+
+static const uchar unicode_to_heb_00[32] = { // row 0x00 lookup used by to8bit(); unkn = no 8859-8 byte
+    0xA0, unkn, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
+    0xA8, 0xA9, 0xD7, 0xAB, 0xAC, 0xAD, 0xAE, unkn, // NOTE(review): slot 10 is 0xD7, yet heb_to_unicode pairs byte 0xAA with U+00D7 - confirm
+    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7,
+    0xB8, 0xB9, 0xF7, 0xBB, 0xBC, 0xBD, 0xBE, unkn, // NOTE(review): slot 26 is 0xF7, yet heb_to_unicode pairs byte 0xBA with U+00F7 - confirm
+};
+
+static const uchar unicode_to_heb_05[32] = { // to8bit() indexes this with (cell - 0xD0) for row 0x05
+    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, // U+05D0..U+05D7
+    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, // U+05D8..U+05DF
+    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, // U+05E0..U+05E7
+    0xF8, 0xF9, 0xFA, unkn, unkn, unkn, unkn, unkn  // U+05E8..U+05EA, then five slots without a byte
+};
+
+// Appends the ISO 8859-8 form of ch to *rstr. Returns TRUE when ch was handled (marks and
+// Hebrew points are dropped on purpose), FALSE when neither ch nor its decomposition maps.
+static bool to8bit(const TQChar ch, TQCString *rstr)
+{
+    if( ch.isMark() ) return TRUE; // ignore marks for conversion
+
+    bool converted = FALSE;
+    const uchar row = ch.row();
+    const uchar cell = ch.cell();
+
+    if ( row == 0x05 ) {
+	// 0x0591 - 0x05cf: Hebrew punctuation... dropped
+	if ( cell >= 0x91 ) converted = TRUE;
+	if ( cell >= 0xD0 ) // the table ends at U+05EF; never index past it
+	    *rstr += (char)( cell < 0xF0 ? unicode_to_heb_05[cell - 0xD0] : unkn );
+    } else if ( row == 0x20 ) {
+	if ( cell == 0x3E ) { // OVERLINE
+	    *rstr += (char)0xAF;
+	    converted = TRUE;
+	} else if ( cell == 0x17 ) { // DOUBLE LOW LINE: byte 0xDF in heb_to_unicode, not 0xCF
+	    *rstr += (char)0xDF;
+	    converted = TRUE;
+	}
+    } else if ( row == 0x00 ) {
+	if ( cell < 0x80 ) { // ASCII maps to itself
+	    *rstr += (char)cell;
+	    converted = TRUE;
+	} else if ( cell == 0xD7 || cell == 0xF7 ) { // MULTIPLICATION SIGN / DIVISION SIGN
+	    *rstr += (char)( cell == 0xD7 ? 0xAA : 0xBA );
+	    converted = TRUE;
+	} else if ( cell >= 0xA0 && cell < 0xC0 && cell != 0xAA && cell != 0xBA ) {
+	    // unicode_to_heb_00 is laid out for U+00A0..U+00BF, not for the C1 range
+	    const uchar mapped = unicode_to_heb_00[cell - 0xA0];
+	    if ( mapped != unkn ) {
+		*rstr += (char)mapped;
+		converted = TRUE;
+	    }
+	}
+    }
+    if(converted) return TRUE;
+
+    // couldn't convert the char... lets try its decomposition
+    const TQString d = ch.decomposition();
+    if(d.isNull())
+	return FALSE;
+
+    const int l = d.length();
+    for (int i=0; i<l; i++)
+	if(to8bit(d[i], rstr)) converted = TRUE;
+    return converted;
+}
+
+#if 0
+// Returns the characters of input in [from, to): back to front when runDir is
+// TQChar::DirR, unchanged for any other direction, and a null string for an empty range.
+static TQString run(const TQString &input, unsigned int from, unsigned int to, TQChar::Direction runDir)
+{
+    if ( from >= to )
+	return TQString::null;
+
+    if ( runDir != TQChar::DirR )
+	return input.mid(from, to - from );
+
+    // right-to-left run: append the characters starting from the last one
+    TQString reversed;
+    const TQChar *chars = input.unicode();
+    for ( unsigned int idx = to; idx > from; --idx )
+	reversed += chars[idx - 1];
+
+    return reversed;
+}
+
+/*
+  Rebuilds str between from and to for base direction dir out of pieces produced by run(); compiled out by the surrounding #if 0.
+  We might do better here, but I'm currently not sure if it's worth the effort. It will hopefully convert 90% of the visually ordered Hebrew correctly.
+*/
+static TQString reverseLine(const TQString &str, unsigned int from, unsigned int to, TQChar::Direction dir)
+{
+    TQString out;
+
+    if ( to <= from ) { // empty or inverted range: only the character at from is returned
+	out += str.at(from);
+	return out;
+    }
+
+    // since we don't have embedding marks, we get around with bidi levels up to 2.
+
+    // simple case: dir = RTL:
+    // go through the line from right to left, and reverse all continuous Hebrew strings.
+    if ( dir == TQChar::DirR ) {
+	unsigned int pos = to; // from and to are swapped here: the walk starts at the right end
+	to = from;
+	from = pos;
+	TQChar::Direction runDir = TQChar::DirON;
+
+	while ( pos > to ) {
+	    TQChar::Direction d = str.at(pos).direction();
+	    switch ( d ) {
+		case TQChar::DirL:
+		case TQChar::DirAN:
+		case TQChar::DirEN:
+		    if ( runDir != TQChar::DirL ) {
+			out += run( str, pos, from, runDir );
+			from = pos - 1;
+		    }
+		    runDir = TQChar::DirL;
+		    break;
+		case TQChar::DirON:
+		    if ( runDir == TQChar::DirON ) { // a leading neutral starts a right-to-left run
+			runDir = TQChar::DirR;
+			break;
+		    }
+		    // fall through
+		case TQChar::DirR:
+		    if ( runDir != TQChar::DirR ) {
+			out += run( str, pos, from, runDir );
+			from = pos - 1;
+		    }
+		    runDir = TQChar::DirR;
+		default:
+		    break;
+	    }
+	    pos--;
+	}
+	out += run( str, pos, from, runDir ); // flush the last run
+    } else {
+	// basicDir == DirL. A bit more complicated, as we might need to reverse two times for numbers.
+	unsigned int pos = from;
+	TQChar::Direction runDir = TQChar::DirON;
+
+	// first reversing. Ignore numbers
+	while ( pos < to ) {
+	    TQChar::Direction d = str.at(pos).direction();
+	    switch ( d ) {
+		case TQChar::DirL:
+		    if ( runDir != TQChar::DirL && runDir != TQChar::DirON ) {
+			out += run( str, from, pos, runDir );
+			tqDebug( "out = %s", out.latin1() );
+			from = pos;
+		    }
+		    runDir = TQChar::DirL;
+		    break;
+		case TQChar::DirON:
+		    if ( runDir == TQChar::DirON ) { // a leading neutral starts a left-to-right run
+			runDir = TQChar::DirL;
+			break;
+		    }
+		    // fall through
+		case TQChar::DirR:
+		case TQChar::DirAN:
+		case TQChar::DirEN:
+		    if ( runDir != TQChar::DirR && runDir != TQChar::DirON ) {
+			out += run( str, from, pos, runDir );
+			tqDebug( "out = %s", out.latin1() );
+			from = pos;
+		    }
+		    runDir = TQChar::DirR;
+		default:
+		    break;
+	    }
+	    pos++;
+	}
+	out += run( str, from, pos, runDir );
+	tqDebug( "out = %s", out.latin1() );
+	// second reversing for numbers
+	TQString in = out;
+	out = "";
+	pos = 0;
+	from = 0;
+	to = in.length() - 1;
+	runDir = TQChar::DirON;
+	while ( pos < to ) {
+	    TQChar::Direction d = str.at(pos).direction(); // NOTE(review): reads str although this pass walks over in - confirm
+	    switch ( d ) {
+		case TQChar::DirL:
+		case TQChar::DirON:
+		case TQChar::DirR:
+		    if ( runDir == TQChar::DirEN && runDir != TQChar::DirON ) { // NOTE(review): the second test is implied by the first
+			out += run( in, from, pos, TQChar::DirR ); //DirR ensures reversing
+			tqDebug( "out = %s", out.latin1() );
+			runDir = TQChar::DirR;
+			from = pos;
+		    }
+		    runDir = TQChar::DirL;
+		    break;
+		case TQChar::DirAN:
+		case TQChar::DirEN:
+		    if ( runDir != TQChar::DirEN && runDir != TQChar::DirON ) {
+			out += in.mid(from, pos-from+1);
+			tqDebug( "out = %s", out.latin1() );
+			from = pos;
+		    }
+		    runDir = TQChar::DirEN;
+		default:
+		    break;
+	    }
+	    pos++;
+	}
+	out += run( str, from, pos, runDir ); // NOTE(review): takes the tail from str, the loop above works on in - confirm
+
+    }
+    return out;
+}
+#endif
+
+/* This function assumes the TQString is still visually ordered.
+ * Finding the basic direction of the text is not easy in this case, since
+ * a string like "my friend MOLAHS" could (in logical order) mean either
+ * "SHALOM my friend" or "my friend SHALOM", depending on the basic direction
+ * one assumes for the text.
+ *
+ * So this function uses some heuristics to find the right answer; it returns DirL for text without strong characters and DirR when it cannot decide.
+ */
+static TQChar::Direction findBasicDirection(TQString str)
+{
+    const unsigned int len = str.length();
+    unsigned int pos = 0;
+    unsigned int startLine = 0;
+    TQChar::Direction dir1 = TQChar::DirON, dir2 = TQChar::DirON;
+
+    // If the visual representation of the first line starts and ends with the same
+    // directionality, we know the answer.
+    while (pos < len) {
+	if ( str.at(pos) == '\n' )
+	    startLine = pos;
+	if (str.at(pos).direction() < 2) { // DirR or DirL
+	    dir1 = str.at(pos).direction();
+	    break;
+	}
+	pos++;
+    }
+
+    if( pos == len ) // no directional chars, assume TQChar::DirL
+	return TQChar::DirL;
+
+    // move to end of line
+    while( pos < len && str.at(pos) != '\n' )
+	pos++;
+
+    // Walk back to the last strong char of the line; index startLine is tested too (it is the first char when no newline precedes).
+    for (;;) {
+	if (pos < len && str.at(pos).direction() < 2) { // DirR or DirL
+	    dir2 = str.at(pos).direction();
+	    break;
+	}
+	if (pos == startLine) break;
+	pos--;
+    }
+
+    // both are the same, so we have the direction!
+    if ( dir1 == dir2 ) return dir1;
+
+    // guess with the help of punctuation marks: if the sentence ends with
+    // punctuation, we should have a mark at one side of the text...
+    pos = 0;
+    while (pos < len-1 ) {
+	if(str.at(pos).category() == TQChar::Punctuation_Other) {
+	    if( str.at(pos) != (char)0xbf && str.at(pos) != (char)0xa1 ) // spanish inverted question and exclamation mark
+		if( str.at(pos+1).direction() < 2 ) return TQChar::DirR;
+	}
+	pos++;
+    }
+
+    // Trailing neutrals: punctuation that directly follows a strong char means left-to-right.
+    // (This loop used to start at len with the test "pos < 1", so it never ran.)
+    pos = len - 1;
+    while (pos > 0 && str.at(pos).direction() >= 2 ) {
+	if(str.at(pos).category() == TQChar::Punctuation_Other) {
+	    if( str.at(pos-1).direction() < 2 ) return TQChar::DirL;
+	}
+	pos--;
+    }
+
+    // don't know try DirR...
+    return TQChar::DirR;
+}
+
+
+/*!
+    \class TQHebrewCodec tqhebrewcodec.h
+    \reentrant
+    \ingroup i18n
+
+    \brief The TQHebrewCodec class provides conversion to and from
+    visually ordered Hebrew.
+
+    Hebrew as a semitic language is written from right to left.
+    Because older computer systems couldn't handle reordering a string
+    so that the first letter appears on the right, many older
+    documents were encoded in visual order, so that the first letter
+    of a line is the rightmost one in the string.
+
+    In contrast to this, Unicode defines characters to be in logical
+    order (the order you would read the string). This codec tries to
+    convert visually ordered Hebrew (8859-8) to Unicode. This might
+    not always work perfectly, because reversing the \e bidi
+    (bi-directional) algorithm that transforms from logical to visual
+    order is non-trivial.
+
+    Transformation from Unicode to visual Hebrew (8859-8) is done
+    using the bidi algorithm in TQt, and will produce correct results,
+    so long as the codec is given the text a whole paragraph at a
+    time. Places where newlines are supposed to go can be indicated by
+    a newline character ('\n'). Note that these newline characters
+    change the reordering behaviour of the algorithm, since the bidi
+    reordering only takes place within one line of text, whereas
+    line breaks are determined in visual order.
+
+    Visually ordered Hebrew is still used quite often in some places,
+    mainly in email communication (since most email programs still
+    don't understand logically ordered Hebrew) and on web pages. The
+    use on web pages is rapidly decreasing, due to the availability of
+    browsers that correctly support logically ordered Hebrew.
+
+    This codec has the name "iso8859-8". If you don't want any bidi
+    reordering to happen during conversion, use the "iso8859-8-i"
+    codec, which assumes logical order for the 8-bit string.
+*/
+
+/*! \reimp */
+int TQHebrewCodec::mibEnum() const
+{
+    return 11; // MIBenum of ISO 8859-8 (ISO_8859-8:1988 in the IANA character set registry)
+}
+
+/*! \reimp */
+const char* TQHebrewCodec::name() const
+{
+    return "ISO 8859-8"; // spelled with a space; the hyphenated form comes from mimeName()
+}
+
+/*!
+    Returns the codec's MIME name, "ISO-8859-8" (name() returns the variant spelled with a space).
+*/
+const char* TQHebrewCodec::mimeName() const
+{
+    return "ISO-8859-8";
+}
+
+// Reorders \a logical with TQTextEngine's bidi code using base direction \a basicDir and returns
+// the items concatenated in visual order; items on an odd bidi level are reversed and mirrored.
+static TQString visualOrder(TQString logical, TQChar::Direction basicDir)
+{
+    // U+2028 (LINE SEPARATOR) stands in for '\n' while reordering; it is mapped back below
+    logical.replace(TQChar('\n'), TQChar(0x2028));
+
+    TQTextEngine e(logical, 0);
+    e.direction = basicDir;
+    e.itemize();
+
+    // up to 255 items fit into the stack buffers, anything larger goes to the heap
+    TQ_UINT8 l[256];
+    TQ_UINT8 *levels = l;
+    int vo[256];
+    int *order = vo; // no longer named like the function itself
+    const int nitems = e.items.size();
+    if (nitems > 255) {
+	levels = new TQ_UINT8[nitems];
+	order = new int[nitems];
+    }
+
+    int i;
+    for (i = 0; i < nitems; ++i) {
+	//tqDebug("item %d bidiLevel=%d", i,  e.items[i].analysis.bidiLevel);
+	levels[i] = e.items[i].analysis.bidiLevel;
+    }
+    e.bidiReorder(nitems, levels, order);
+
+    TQString visual;
+    for (i = 0; i < nitems; ++i) {
+	TQScriptItem &si = e.items[order[i]];
+	const TQString sub = logical.mid(si.position, e.length(order[i]));
+	if (si.analysis.bidiLevel % 2) {
+	    // Right-to-left item: append it back to front, mirroring each character.
+	    // The text is read through the const pointer only; the old code swapped the
+	    // characters in place through a cast of sub.unicode(), which writes into a
+	    // buffer that TQString hands out as const and that may be shared.
+	    const TQChar *src = sub.unicode();
+	    for (int j = (int)sub.length() - 1; j >= 0; --j)
+		visual += src[j].mirroredChar();
+	} else {
+	    visual += sub;
+	}
+    }
+
+    // replace Unicode newline back with  \n to compare.
+    visual.replace(TQChar(0x2028), TQChar('\n'));
+
+    if (l != levels) {
+	delete [] levels;
+	delete [] order;
+    }
+    return visual;
+}
+
+/*!
+    \reimp
+
+    Since Hebrew (and Arabic) is written from right to left, but
+    iso8859-8 assumes visual ordering (as opposed to the logical
+    ordering of Unicode), we must reverse the order of the input
+    string (the first \a len characters of \a chars) to put it into
+    logical order.
+
+    One problem is that the basic text direction is unknown. So this
+    function uses some heuristics to guess it, and if it can't guess
+    the right one, it assumes, the basic text direction is right to
+    left.
+
+    This behaviour can be overridden, by putting a control character
+    at the beginning of the text to indicate which basic text
+    direction to use. If the basic text direction is left-to-right,
+    the control character should be (uchar) 0xFE. For right-to-left it
+    should be 0xFF. Both characters are undefined in the iso 8859-8
+    charset. The control character counts towards \a len.
+
+    Example: A visually ordered string "english WERBEH american" would
+    be recognized as having a basic left to right direction. So the
+    logically ordered TQString would be "english HEBREW american".
+
+    By prepending a (uchar)0xFF at the start of the string,
+    TQHebrewCodec::toUnicode() would use a basic text direction of
+    right to left, and the string would thus become "american HEBREW
+    english".
+*/
+TQString TQHebrewCodec::toUnicode(const char* chars, int len ) const
+{
+    if( !chars || len <= 0 ) return TQString::null;
+
+    const unsigned char * c = (const unsigned char *)chars;
+    TQChar::Direction basicDir = TQChar::DirON; // neutral, we don't know
+
+    // Test, if the user gives us a directionality.
+    // We use 0xFE and 0xFF in ISO8859-8 for that.
+    // These chars are undefined in the charset; only one leading hint
+    // is consumed.
+    if( c[0] == 0xfe || c[0] == 0xff ) {
+	basicDir = ( c[0] == 0xfe ) ? TQChar::DirL : TQChar::DirR;
+	// skip directionality hint; len shrinks with it so that the
+	// loop below no longer reads one byte past the input
+	c++;
+	len--;
+	if( len == 0 ) return TQString::null;
+    }
+
+    TQString r;
+    for( int i=0; i<len; i++ ) {
+	if ( c[i] > 127 )
+	    r[i] = heb_to_unicode[c[i]-128];
+	else
+	    r[i] = c[i];
+    }
+
+    // do transformation from visual byte ordering to logical byte
+    // ordering
+    if( basicDir == TQChar::DirON )
+	basicDir = findBasicDirection(r);
+
+    return visualOrder(r, basicDir);
+}
+
+/*!
+    Transforms the logically ordered TQString, \a uc, into a visually
+    ordered string in the 8859-8 encoding. TQt's bidi algorithm is used
+    to perform this task. Note that newline characters affect the
+    reordering, since reordering is done on a line by line basis.
+
+    The algorithm is designed to work on whole paragraphs of text, so
+    processing a line at a time may produce incorrect results. This
+    approach is taken because the reordering of the contents of a
+    particular line in a paragraph may depend on the previous line in
+    the same paragraph.
+
+    Some encodings (for example Japanese or UTF-8) are multibyte (so
+    one input character is mapped to two output characters). The \a
+    lenInOut argument specifies the number of TQChars that should be converted
+    (0 or less: all of \a uc) and is set to the number of bytes returned.
+*/
+TQCString TQHebrewCodec::fromUnicode(const TQString& uc, int& lenInOut) const
+{
+    // process only len chars...
+    int l;
+    if( lenInOut > 0 )
+	l = TQMIN((int)uc.length(),lenInOut);
+    else
+	l = (int)uc.length();
+
+    TQCString rstr;
+    if( l == 1 ) {
+	// a single character needs no reordering
+	if( !to8bit( uc[0], &rstr ) )
+	    rstr += (char)unkn;
+    } else {
+	TQString tmp = uc;
+	tmp.truncate(l);
+	const TQString vis = visualOrder(tmp, TQChar::DirON);
+
+	for (int i=0; i<l; i++) {
+	    if( !to8bit( vis[i], &rstr ) )
+		rstr += (char)unkn;
+	}
+    }
+    if( l > 0 && !rstr.length() )
+	rstr += (char)unkn;
+
+    // report the size of the result as documented; it can differ from l (dropped marks, decompositions)
+    lenInOut = (int)rstr.length();
+    return rstr;
+}
+
+/*! \reimp
+    Scores one point per byte >= 0x80 that has a mapping in heb_to_unicode; returns -1 on the first byte without one.
+ */
+int TQHebrewCodec::heuristicContentMatch(const char* chars, int len) const
+{
+    const unsigned char * c = (const unsigned char *)chars;
+
+    int score = 0;
+    for (int i=0; i<len; i++) {
+	if ( c[i] < 0x80 ) // ASCII says nothing; byte 0x80 itself is checked below (it used to be skipped)
+	    continue;
+	if ( heb_to_unicode[c[i] - 0x80] == 0xFFFD )
+	    return -1;
+	score++;
+    }
+    return score;
+}
+
+#endif