/**************************************************************************** ** Implementation of QGb18030Codec template/macro class ** ** Copyright (C) 1992-2008 Trolltech ASA. All rights reserved. ** ** This file is part of the tools module of the Qt GUI Toolkit. ** ** This file may be used under the terms of the GNU General ** Public License versions 2.0 or 3.0 as published by the Free ** Software Foundation and appearing in the files LICENSE.GPL2 ** and LICENSE.GPL3 included in the packaging of this file. ** Alternatively you may (at your option) use any later version ** of the GNU General Public License if such license has been ** publicly approved by Trolltech ASA (or its successors, if any) ** and the KDE Free Qt Foundation. ** ** Please review the following information to ensure GNU General ** Public Licensing requirements will be met: ** http://trolltech.com/products/qt/licenses/licensing/opensource/. ** If you are unsure which license is appropriate for your use, please ** review the following information: ** http://trolltech.com/products/qt/licenses/licensing/licensingoverview ** or contact the sales department at sales@trolltech.com. ** ** This file may be used under the terms of the Q Public License as ** defined by Trolltech ASA and appearing in the file LICENSE.QPL ** included in the packaging of this file. Licensees holding valid Qt ** Commercial licenses may use this file in accordance with the Qt ** Commercial License Agreement provided with the Software. ** ** This file is provided "AS IS" with NO WARRANTY OF ANY KIND, ** INCLUDING THE WARRANTIES OF DESIGN, MERCHANTABILITY AND FITNESS FOR ** A PARTICULAR PURPOSE. Trolltech reserves all rights not granted ** herein. ** **********************************************************************/ /*! \class QGb18030Codec qgb18030codec.h \reentrant \ingroup i18n \brief The QGb18030Codec class provides conversion to and from the Chinese GB18030/GBK/GB2312 encoding. 
\omit Last updated: September, 3, 2002 \endomit GBK, formally the Chinese Internal Code Specification, is a commonly used extension of GB 2312-80. Microsoft Windows uses it under the name codepage 936. GBK has been superseded by the new Chinese national standard GB 18030-2000, which added a 4-byte encoding while remaining compatible with GB2312 and GBK. The new GB 18030-2000 may be described as a special encoding of Unicode 3.x and ISO-10646-1. Special thanks to charset gurus Markus Scherer (IBM), Dirk Meyer (Adobe Systems) and Ken Lunde (Adobe Systems) for publishing an excellent GB 18030-2000 summary and specification on the Internet. Some must-read documents are: \list \i \l{ftp://ftp.oreilly.com/pub/examples/nutshell/cjkv/pdf/GB18030_Summary.pdf} \i \l{http://oss.software.ibm.com/cvs/icu/~checkout~/charset/source/gb18030/gb18030.html} \i \l{http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/gb-18030-2000.xml} \endlist The GBK codec was contributed to Qt by Justin Yu \ and Sean Chen \. They may also be reached at Yu Mingjian \, \ Chen Xiangyang \ The GB18030 codec Qt functions were contributed to Qt by James Su \, \ who pioneered much of GB18030 development on GNU/Linux systems. The GB18030 codec was contributed to Qt by Anthony Fok \, \ using a Perl script to generate C++ tables from gb-18030-2000.xml while merging contributions from James Su, Justin Yu and Sean Chen. A copy of the source Perl script is available at: \l{http://people.debian.org/~foka/gb18030/gen-qgb18030codec.pl} The copyright notice for their code follows: \legalese Copyright (C) 2000 TurboLinux, Inc. Written by Justin Yu and Sean Chen. Copyright (C) 2001, 2002 Turbolinux, Inc. Written by James Su. Copyright (C) 2001, 2002 ThizLinux Laboratory Ltd. Written by Anthony Fok. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: \list 1 \i Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. \i Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. \endlist THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "qgb18030codec.h"

// Build guard: refuses to compile when QT_VERSION is 0x040000 or newer.
#if (QT_VERSION-0 >= 0x040000)
#error "move obsolete header into the src/compat directory"
#endif

// Everything below is compiled only when QT_NO_BIG_CODECS is not defined.
#ifndef QT_NO_BIG_CODECS

/*
  Byte-classification helpers.

  All of these are function-like macros, so an argument may be evaluated
  more than once; do not pass expressions with side effects.
*/

// True when lower <= c <= upper (both bounds inclusive).
#define InRange(c, lower, upper) (((c) >= (lower)) && ((c) <= (upper)))
// True for values up to 0x7F, which are emitted/consumed as a single byte.
#define IsLatin(c) ((c) <= 0x7F)
// True for a byte in 0xA1..0xFE.
#define IsByteInGb2312(c) (InRange((c), 0xA1, 0xFE))
// Lead byte of a multi-byte sequence: 0x81..0xFE.
#define Is1stByte(c) (InRange((c), 0x81, 0xFE))
// Trail byte of a 2-byte sequence: 0x40..0xFE except 0x7F.
#define Is2ndByteIn2Bytes(c) (InRange((c), 0x40, 0xFE) && (c) != 0x7F)
// Second byte of a 4-byte sequence: 0x30..0x39.
#define Is2ndByteIn4Bytes(c) (InRange((c), 0x30, 0x39))
// Second byte acceptable in either the 2-byte or the 4-byte form.
#define Is2ndByte(c) (Is2ndByteIn2Bytes(c) || Is2ndByteIn4Bytes(c))
// Third byte of a 4-byte sequence: 0x81..0xFE.
#define Is3rdByte(c) (InRange((c), 0x81, 0xFE))
// Fourth byte of a 4-byte sequence: 0x30..0x39.
#define Is4thByte(c) (InRange((c), 0x30, 0x39))

// Wraps a code value in a QChar (truncated to ushort); a value of 0 yields
// QChar::replacement instead.
#define QValidChar(u) ((u) ? QChar((ushort)(u)) : QChar::replacement)

/* User-defined areas:
     UDA 1: 0xAAA1 - 0xAFFE (564/0)
     UDA 2: 0xF8A1 - 0xFEFE (658/0)
     UDA 3: 0xA140 - 0xA7A0 (672/0)
*/
// Each macro takes the lead byte (a) and trail byte (b) of a 2-byte sequence.
#define IsUDA1(a, b) (InRange((a), 0xAA, 0xAF) && InRange((b), 0xA1, 0xFE))
#define IsUDA2(a, b) (InRange((a), 0xF8, 0xFE) && InRange((b), 0xA1, 0xFE))
#define IsUDA3(a, b) (InRange((a), 0xA1, 0xA7) && InRange((b), 0x40, 0xA0) && ((b) != 0x7F))

// One entry of the per-high-byte index tables used by the conversion
// functions in this file.  A table is indexed with (code >> 8); the low
// byte of the code then selects between a table lookup and a computed value.
typedef struct {
    Q_UINT8 tblBegin;     // first low-byte value (inclusive) resolved via the mapping table
    Q_UINT8 tblEnd;       // last low-byte value (inclusive) resolved via the mapping table
    Q_UINT16 tblOffset;   // subtracted from the code to obtain the mapping-table index
    Q_UINT16 algOffset;   // base added to the low byte when the result is computed instead
} indexTbl_t;

// Conversion primitives defined later in this file.
// qt_UnicodeToGbk is declared without `static`, so it has external linkage.
static uint qt_Gb18030ToUnicode(const uchar *gbstr, int& len);
static int qt_UnicodeToGb18030(uint unicode, uchar *gbchar);
int qt_UnicodeToGbk(uint unicode, uchar *gbchar);

/*! \internal */
// Default constructor; performs no work.
QGb18030Codec::QGb18030Codec()
{
}

/*! \reimp */
// Returns the codec name, the literal string "GB18030".
const char* QGb18030Codec::name() const
{
    //tqDebug("QGb18030Codec::name() = \"GB18030\"");
    return "GB18030";
}

/*! \reimp */
// Returns 114 (presumably the IANA MIBenum registered for GB18030 -- confirm
// against the IANA character-set registry).
int QGb18030Codec::mibEnum() const
{
    return 114;
}

/*! \reimp */
// Encodes up to lenInOut UTF-16 units of uc (all of uc when lenInOut is
// negative) and stores the number of bytes produced back into lenInOut.
//
// NOTE(review): the body of this function is damaged in this copy of the
// file -- see the notes at the marked lines.  It cannot compile as-is and
// must be restored from a pristine copy before building.
QCString QGb18030Codec::fromUnicode(const QString& uc, int& lenInOut) const
{
    // Number of input units to convert: never more than the string holds.
    int l = QMIN((int)uc.length(),(lenInOut<0)?(int)uc.length():lenInOut);
    // Output buffer: 4 bytes per input unit (the largest size
    // qt_UnicodeToGb18030 returns) plus one extra byte.
    int rlen = l*4+1;
    QCString rstr(rlen);
    uchar* cursor = (uchar*)rstr.data();

    //tqDebug("QGb18030Codec::fromUnicode(const QString& uc, int& lenInOut = %d)", lenInOut);
    // NOTE(review): text is missing between "i" and "= 0xdc00)": the loop
    // condition/increment and the declarations of `high`, `len` and `buf`
    // used below are not present.
    for (int i=0; i= 0xdc00)
        *cursor++ = '?';
    else {
        unsigned short low = uc[i+1].unicode();
        if (low >= 0xdc00 && low <= 0xdfff) {
            // valid surrogate pair
            ++i;
            // Combine the high/low surrogates into one code point >= 0x10000.
            uint u = (high-0xd800)*0x400+(low-0xdc00)+0x10000;
            len = qt_UnicodeToGb18030(u, buf);
            if (len >= 2) {
                // NOTE(review): text is missing after each "j" on the next
                // two lines; the inner loops and the surrounding branches
                // are incomplete.  The code that follows may belong to a
                // different function -- verify against a pristine copy.
                for (int j=0; j= 2 ) {
                    for (int j=0; j= 0xA1) && (buf[1] >= 0xA1) ) {
                        *cursor++ = buf[0];
                        *cursor++ = buf[1];
                    } else {
                        // Error
                        *cursor++ = '?'; // unknown char
                    }
    }

    // Report the number of bytes actually written and trim the result to it.
    lenInOut = cursor - (uchar*)rstr.data();

    rstr.truncate(lenInOut);
    return rstr;
}

/*!
\reimp */
// NOTE(review): this definition is damaged in this copy of the file.  The
// loop header below is incomplete, and the statements after it use `gbstr`,
// `first` and `uni`, none of which are declared here.  They presumably
// belong to qt_Gb18030ToUnicode(const uchar *gbstr, int& len), whose
// prototype appears earlier in the file.  Restore from a pristine copy
// before building.
QString QGb2312Codec::toUnicode(const char* chars, int len) const
{
    QString result;
    int clen;

    //tqDebug("QGb2312Codec::toUnicode(const char* chars, int len = %d)", len);
    // NOTE(review): text is missing between "i" and "= 2 ) {".
    for (int i=0; i= 2 ) {
        uchar second = gbstr[1];

        if ( Is2ndByteIn2Bytes(second) ) {
            // 2-byte sequence: report 2 bytes consumed through `len`.
            len = 2;

            // The three user-defined areas map linearly onto
            // U+E000.., U+E234.. and U+E4C6.. respectively.
            if (IsUDA1(first, second))
                uni = 0xE000 + (first - 0xAA) * 94 + (second - 0xA1);
            else if (IsUDA2(first, second))
                uni = 0xE234 + (first - 0xF8) * 94 + (second - 0xA1);
            else if (IsUDA3(first, second))
                // 96 cells per row; the "- 1" skips the unused 0x7F trail byte.
                uni = 0xE4C6 + (first - 0xA1) * 96 + (second - 0x40)
                      - ((second >= 0x80) ? 1 : 0);
            else {
                // Use the mapping table
                uint i;

                // Linear cell number: 190 trail bytes per lead byte
                // (0x40..0xFE without 0x7F).
                i = (first - 0x81) * 190 + (second - 0x40)
                    - ((second >= 0x80) ? 1 : 0);

                // Remove the user-defined-area cells that precede this one;
                // they are handled arithmetically above and apparently not
                // stored in gb18030_2byte_to_ucs.
                if (InRange(first, 0xA1, 0xA7))
                    i -= (first - 0xA0) * 96;
                if (first > 0xA7)
                    i -= 672;
                if (InRange(first, 0xAA, 0xAF))
                    i -= (first - 0xAA) * 94;
                if (first > 0xAF)
                    i -= 564;
                if (first >= 0xF8)
                    i -= (first - 0xF8) * 94;

                uni = (uint)gb18030_2byte_to_ucs[i];
            }
        }
        else if ( Is2ndByteIn4Bytes(second) && len >= 4 ) {
            uchar third = gbstr[2],
                  fourth = gbstr[3];

            if ( Is3rdByte(third) && Is4thByte(fourth) ) {
                // Valid 4-byte GB18030, whether defined or not
                uint gb4lin;
                indexTbl_t g2u;

                // Linear index of the 4-byte code:
                // 10 values for byte 4, 126 for byte 3, 10 for byte 2.
                gb4lin = (first - 0x81) * 12600 + (second - 0x30) * 1260
                         + (third - 0x81) * 10 + (fourth - 0x30);

                len = 4;
                if ( gb4lin <= 0x99FB ) {
                    /* GB+81308130 - GB+8431A439 */
                    g2u = gb18030_to_ucs_index[gb4lin >> 8];

                    // Low byte inside [tblBegin, tblEnd]: table lookup;
                    // otherwise the value is algOffset + low byte.
                    if ((Q_UINT8)(gb4lin & 0xFF) >= g2u.tblBegin && (Q_UINT8)(gb4lin & 0xFF) <= g2u.tblEnd) {
                        uni = (uint)gb18030_4byte_to_ucs[gb4lin - g2u.tblOffset];
                    } else {
                        uni = g2u.algOffset + (gb4lin & 0xFF);
                    }
                } else if (InRange(gb4lin, 0x2E248, 0x12E247)) {
                    /* GB+90308130 - GB+E3329A35 */
                    // Linear mapping onto U+10000..U+10FFFF.
                    uni = gb4lin - 0x1E248;
                } else {
                    /* undefined or reserved area */
                    len = 1;
                    uni = QChar::replacement.unicode();
                }
            } else {
                // Bad third/fourth byte: consume one byte, yield the
                // replacement character.
                len = 1;
                uni = QChar::replacement.unicode();
            }
        } else {
            // Second byte fits neither form (or fewer than 4 bytes available).
            len = 1;
            uni = QChar::replacement.unicode();
        }
    } else {
        len = 1;
        uni = QChar::replacement.unicode();
    }
    return uni;
}

// Encodes the code point `uni` as GB18030 into `gbchar`.
//
// Returns the number of bytes written: 1 for values up to 0x7F, 2 or 4 for
// multi-byte encodings, or 0 (with gbchar[0] set to 0) when the value falls
// in none of the handled ranges.  `gbchar` must have room for 4 bytes.
int qt_UnicodeToGb18030(uint uni, uchar *gbchar)
{
    /* Returns the bytesize of the GB18030 character. */
    uint gb, gb4lin;
    indexTbl_t u2g;

    if ( IsLatin(uni) ) {
        *gbchar = (uchar)uni;
        return 1;
    } else if (uni <= 0xD7FF || InRange(uni, 0xE766, 0xFFFF)) {
        // BMP outside the surrogate range and outside U+E000..U+E765.
        u2g = ucs_to_gb18030_index[uni >> 8];

        if ((Q_UINT8)(uni & 0xFF) >= u2g.tblBegin && (Q_UINT8)(uni & 0xFF) <= u2g.tblEnd) {
            // Use mapping table (2-byte or 4-byte GB18030)
            uint tblEntry;

            tblEntry = ucs_to_gb18030[uni - u2g.tblOffset];

            if (tblEntry > 0x8000) {
                // 2-byte GB18030
                gb = tblEntry;
            } else {
                // 4-byte GB18030 stored in a special compact format
                // Layout as decoded below: bits 0-3 -> fourth byte,
                // bits 4-10 -> third byte, bits 11 and up -> second byte,
                // with the thresholds selecting the first byte (0x81..0x84)
                // and rebasing the second byte accordingly.
                uchar a, b;
                a = 0x81;
                b = 0x30 + (tblEntry >> 11);
                if (tblEntry >= 0x7000) {
                    a += 3;
                    b -= 14;
                } else if (tblEntry >= 0x6000) {
                    a += 2;
                    b -= 6;
                } else if (tblEntry >= 0x3000) {
                    a += 1;
                    b -= 6;
                } else if (b >= 0x31) {
                    b += 5;
                }
                gbchar[0] = a;
                gbchar[1] = b;
                gbchar[2] = 0x81 + ( (tblEntry >> 4) & 0x7F );
                gbchar[3] = 0x30 + (tblEntry & 0xF);
                return 4;
            }
        } else {
            // 4-byte GB18030 calculated algorithmically
            gb4lin = u2g.algOffset + (uni & 0xFF);
            // Yikes, my index table could not cover all the bases...
            if (InRange(uni, 0x49B8, 0x49FF))
                gb4lin -= 11;
            // gb4lin_to_gb is defined outside this view; presumably it turns
            // the linear index into the packed 4-byte code -- TODO confirm.
            gb = gb4lin_to_gb(gb4lin);
        }
    } else if (InRange(uni, 0xE000, 0xE765)) {
        // User-defined areas in GB18030 (2-byte)
        // U+E000..U+E233 -> rows from 0xAAA1, U+E234..U+E4C5 -> rows from
        // 0xF8A1 (94 cells per row); the rest -> rows from 0xA140 (96 cells).
        if (uni <= 0xE233)
            gb = 0xAAA1 + (((uni - 0xE000) / 94) << 8) + (uni - 0xE000) % 94;
        else if (uni <= 0xE4C5)
            gb = 0xF8A1 + (((uni - 0xE234) / 94) << 8) + (uni - 0xE234) % 94;
        else {
            gb = 0xA140 + (((uni - 0xE4C6) / 96) << 8) + (uni - 0xE4C6) % 96;
            // Skip the gap at 0x7F
            if ((gb & 0xFF) >= 0x7F)
                gb++;
        }
    } else if (InRange(uni, 0x10000, 0x10FFFF)) {
        // Qt 3.x does not support beyond BMP yet, but what the heck...
        // (U+10000 = GB+90308130) to (U+10FFFF = GB+E3329A35)
        gb = gb4lin_to_gb(0x1E248 + uni);
    } else {
        // Surrogate area and other undefined/reserved areas (discard)
        *gbchar = 0;
        return 0;
    }

    // Emit the packed code big-endian: 2 bytes when it fits in 16 bits,
    // otherwise 4 bytes.
    if (gb <= 0xFFFF) {
        gbchar[0] = (uchar)((gb >> 8) & 0xFF);
        gbchar[1] = (uchar)(gb & 0xFF);
        return 2;
    } else {
        gbchar[0] = (uchar)((gb >> 24) & 0xFF);
        gbchar[1] = (uchar)((gb >> 16) & 0xFF);
        gbchar[2] = (uchar)((gb >> 8) & 0xFF);
        gbchar[3] = (uchar)(gb & 0xFF);
        return 4;
    }
}

// Encodes the code point `uni` as GBK into `gbchar`, using the same tables
// as qt_UnicodeToGb18030 but rejecting anything that would need 4 bytes.
//
// Returns 1 for values up to 0x7F, 2 for a 2-byte code, or 0 (with
// gbchar[0] set to 0) when no 1- or 2-byte encoding is available.
int qt_UnicodeToGbk(uint uni, uchar *gbchar)
{
    /* Returns the bytesize of the GBK character. */
    /* Intended for improving performance of GB2312 and GBK functions. */
    uint gb;
    indexTbl_t u2g;

    if ( IsLatin(uni) ) {
        *gbchar = (uchar)uni;
        return 1;
    } else if (uni <= 0xD7FF || InRange(uni, 0xE766, 0xFFFF)) {
        u2g = ucs_to_gb18030_index[uni >> 8];

        if ( (Q_UINT8)(uni & 0xFF) >= u2g.tblBegin && (Q_UINT8)(uni & 0xFF) <= u2g.tblEnd ) {
            // Use mapping table (2-byte GBK or 4-byte GB18030)
            uint tblEntry;

            tblEntry = ucs_to_gb18030[uni - u2g.tblOffset];

            if (tblEntry > 0x8000) {
                // GBK
                gb = tblEntry;
            } else {
                // 4-byte GB18030 stored in a special compact format (discard)
                *gbchar = 0;
                return 0;
            }
        } else {
            // 4-byte GB18030 calculated algorithmically (discard)
            *gbchar = 0;
            return 0;
        }
    } else if (InRange(uni, 0xE000, 0xE765)) {
        // User-defined areas in GB18030 (2-byte)
        // Same three linear ranges as in qt_UnicodeToGb18030.
        if (uni <= 0xE233)
            gb = 0xAAA1 + (((uni - 0xE000) / 94) << 8) + (uni - 0xE000) % 94;
        else if (uni <= 0xE4C5)
            gb = 0xF8A1 + (((uni - 0xE234) / 94) << 8) + (uni - 0xE234) % 94;
        else {
            gb = 0xA140 + (((uni - 0xE4C6) / 96) << 8) + (uni - 0xE4C6) % 96;
            // Skip the gap at 0x7F
            if ((gb & 0xFF) >= 0x7F)
                gb++;
        }
    } else {
        // Surrogate area and other undefined/reserved areas (discard)
        *gbchar = 0;
        return 0;
    }

    // Every path that reaches here holds a 16-bit code: emit it big-endian.
    gbchar[0] = (uchar)((gb >> 8) & 0xFF);
    gbchar[1] = (uchar)(gb & 0xFF);
    return 2;
}

#endif