/*************************************************************************** copyright : (C) 2006 by Robby Stephenson email : robby@periapsis.org ***************************************************************************/ /*************************************************************************** * * * This program is free software; you can redistribute it and/or modify * * it under the terms of version 2 of the GNU General Public License as * * published by the Free Software Foundation; * * * ***************************************************************************/ // This class is adapted from Iso6937ToUnicode from the MARC4J project, available // from http://marc4j.tigris.org, with the following notice: // * Copyright (C) 2002 Bas Peters (mail@bpeters.com) // * Copyright (C) 2002 Yves Pratter (ypratter@club-internet.fr) // // That source was released under the terms of the GNU Lesser General Public // License, version 2.1. In accordance with Condition 3 of that license, // I am applying the terms of the GNU General Public License to the source // code, and including a large portion of it here #include "iso6937converter.h" #include "tellico_debug.h" #include <tqstring.h> using Tellico::Iso6937Converter; TQString Iso6937Converter::toUtf8(const TQCString& text_) { const uint len = text_.length(); TQString result; result.reserve(len); uint pos = 0; for(uint i = 0; i < len; ++i) { uchar c = text_[i]; if(isAscii(c)) { result[pos++] = c; } else if(isCombining(c) && hasNext(i, len)) { TQChar d = getCombiningChar(c * 256 + text_[i + 1]); if(!d.isNull()) { result[pos++] = d; ++i; } else { result[pos++] = getChar(c); } } else { result[pos++] = getChar(c); } } result.squeeze(); return result; } inline bool Iso6937Converter::hasNext(uint pos, uint len) { return pos < (len - 1); } inline bool Iso6937Converter::isAscii(uchar c) { return c <= 0x7F; } inline bool Iso6937Converter::isCombining(uchar c) { return c >= 0xC0 && c <= 0xDF; } // Source : http://anubis.dkuug.dk/JTC1/SC2/WG3/docs/6937cd.pdf TQChar Iso6937Converter::getChar(uchar c) { switch(c) { case 0xA0: return 0x00A0; // 10/00 NO-BREAK SPACE case 0xA1: return 0x00A1; // 10/01 INVERTED EXCLAMATION MARK case 0xA2: return 0x00A2; // 10/02 CENT SIGN case 0xA3: return 0x00A3; // 10/03 POUND SIGN // 10/04 (This position shall not be used) case 0xA5: return 0x00A5; // 10/05 YEN SIGN // 10/06 (This position shall not be used) case 0xA7: return 0x00A7; // 10/07 SECTION SIGN case 0xA8: return 0x00A4; // 10/08 CURRENCY SIGN case 0xA9: return 0x2018; // 10/09 LEFT SINGLE QUOTATION MARK case 0xAA: return 0x201C; // 10/10 LEFT DOUBLE QUOTATION MARK case 0xAB: return 0x00AB; // 10/11 LEFT-POINTING DOUBLE ANGLE QUOTATION MARK case 0xAC: return 0x2190; // 10/12 LEFTWARDS ARROW case 0xAD: return 0x2191; // 10/13 UPWARDS ARROW case 0xAE: return 0x2192; // 10/14 RIGHTWARDS ARROW case 0xAF: return 0x2193; // 10/15 DOWNWARDS ARROW case 0xB0: return 0x00B0; // 11/00 DEGREE SIGN case 0xB1: return 0x00B1; // 11/01 PLUS-MINUS SIGN case 0xB2: return 0x00B2; // 11/02 SUPERSCRIPT TWO case 0xB3: return 0x00B3; // 11/03 SUPERSCRIPT THREE case 0xB4: return 0x00D7; // 11/04 MULTIPLICATION SIGN case 0xB5: return 0x00B5; // 11/05 MICRO SIGN case 0xB6: return 0x00B6; // 11/06 PILCROW SIGN case 0xB7: return 0x00B7; // 11/07 MIDDLE DOT case 0xB8: return 0x00F7; // 11/08 DIVISION SIGN case 0xB9: return 0x2019; // 11/09 RIGHT SINGLE QUOTATION MARK case 0xBA: return 0x201D; // 11/10 RIGHT DOUBLE QUOTATION MARK case 0xBB: return 0x00BB; // 11/11 RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK case 0xBC: return 0x00BC; // 11/12 VULGAR FRACTION ONE QUARTER case 0xBD: return 0x00BD; // 11/13 VULGAR FRACTION ONE HALF case 0xBE: return 0x00BE; // 11/14 VULGAR FRACTION THREE QUARTERS case 0xBF: return 0x00BF; // 11/15 INVERTED QUESTION MARK // 4/0 to 5/15 diacritic characters case 0xD0: return 0x2015; // 13/00 HORIZONTAL BAR case 0xD1: return 0x00B9; // 13/01 SUPERSCRIPT ONE case 0xD2: return 0x2117; // 13/02 REGISTERED SIGN case 0xD3: return 0x00A9; // 13/03 COPYRIGHT SIGN case 0xD4: return 0x00AE; // 13/04 TRADE MARK SIGN case 0xD5: return 0x266A; // 13/05 EIGHTH NOTE case 0xD6: return 0x00AC; // 13/06 NOT SIGN case 0xD7: return 0x00A6; // 13/07 BROKEN BAR // 13/08 (This position shall not be used) // 13/09 (This position shall not be used) // 13/10 (This position shall not be used) // 13/11 (This position shall not be used) case 0xDC: return 0x215B; // 13/12 VULGAR FRACTION ONE EIGHTH case 0xDF: return 0x215E; // 13/15 VULGAR FRACTION SEVEN EIGHTHS case 0xE0: return 0x2126; // 14/00 OHM SIGN case 0xE1: return 0x00C6; // 14/01 LATIN CAPITAL LETTER AE case 0xE2: return 0x0110; // 14/02 LATIN CAPITAL LETTER D WITH STROKE case 0xE3: return 0x00AA; // 14/03 FEMININE ORDINAL INDICATOR case 0xE4: return 0x0126; // 14/04 LATIN CAPITAL LETTER H WITH STROKE // 14/05 (This position shall not be used) case 0xE6: return 0x0132; // 14/06 LATIN CAPITAL LIGATURE IJ case 0xE7: return 0x013F; // 14/07 LATIN CAPITAL LETTER L WITH MIDDLE DOT case 0xE8: return 0x0141; // 14/08 LATIN CAPITAL LETTER L WITH STROKE case 0xE9: return 0x00D8; // 14/09 LATIN CAPITAL LETTER O WITH STROKE case 0xEA: return 0x0152; // 14/10 LATIN CAPITAL LIGATURE OE case 0xEB: return 0x00BA; // 14/11 MASCULINE ORDINAL INDICATOR case 0xEC: return 0x00DE; // 14/12 LATIN CAPITAL LETTER THORN case 0xED: return 0x0166; // 14/13 LATIN CAPITAL LETTER T WITH STROKE case 0xEE: return 0x014A; // 14/14 LATIN CAPITAL LETTER ENG case 0xEF: return 0x0149; // 14/15 LATIN SMALL LETTER N PRECEDED BY APOSTROPHE case 0xF0: return 0x0138; // 15/00 LATIN SMALL LETTER KRA case 0xF1: return 0x00E6; // 15/01 LATIN SMALL LETTER AE case 0xF2: return 0x0111; // 15/02 LATIN SMALL LETTER D WITH STROKE case 0xF3: return 0x00F0; // 15/03 LATIN SMALL LETTER ETH case 0xF4: return 0x0127; // 15/04 LATIN SMALL LETTER H WITH STROKE case 0xF5: return 0x0131; // 15/05 LATIN SMALL LETTER DOTLESS I case 0xF6: return 0x0133; // 15/06 LATIN SMALL LIGATURE IJ case 0xF7: return 0x0140; // 15/07 LATIN SMALL LETTER L WITH MIDDLE DOT case 0xF8: return 0x0142; // 15/08 LATIN SMALL LETTER L WITH STROKE case 0xF9: return 0x00F8; // 15/09 LATIN SMALL LETTER O WITH STROKE case 0xFA: return 0x0153; // 15/10 LATIN SMALL LIGATURE OE case 0xFB: return 0x00DF; // 15/11 LATIN SMALL LETTER SHARP S case 0xFC: return 0x00FE; // 15/12 LATIN SMALL LETTER THORN case 0xFD: return 0x0167; // 15/13 LATIN SMALL LETTER T WITH STROKE case 0xFE: return 0x014B; // 15/14 LATIN SMALL LETTER ENG case 0xFF: return 0x00AD; // 15/15 SOFT HYPHEN$ default: return c; } } TQChar Iso6937Converter::getCombiningChar(uint c) { switch(c) { // 12/00 (This position shall not be used) // 12/01 non-spacing grave accent case 0xC141: return 0x00C0; // LATIN CAPITAL LETTER A WITH GRAVE case 0xC145: return 0x00C8; // LATIN CAPITAL LETTER E WITH GRAVE case 0xC149: return 0x00CC; // LATIN CAPITAL LETTER I WITH GRAVE case 0xC14F: return 0x00D2; // LATIN CAPITAL LETTER O WITH GRAVE case 0xC155: return 0x00D9; // LATIN CAPITAL LETTER U WITH GRAVE case 0xC161: return 0x00E0; // LATIN SMALL LETTER A WITH GRAVE case 0xC165: return 0x00E8; // LATIN SMALL LETTER E WITH GRAVE case 0xC169: return 0x00EC; // LATIN SMALL LETTER I WITH GRAVE case 0xC16F: return 0x00F2; // LATIN SMALL LETTER O WITH GRAVE case 0xC175: return 0x00F9; // LATIN SMALL LETTER U WITH GRAVE // 12/02 non-spacing acute accent case 0xC220: return 0x00B4; // ACUTE ACCENT case 0xC241: return 0x00C1; // LATIN CAPITAL LETTER A WITH ACUTE case 0xC243: return 0x0106; // LATIN CAPITAL LETTER C WITH ACUTE case 0xC245: return 0x00C9; // LATIN CAPITAL LETTER E WITH ACUTE case 0xC249: return 0x00CD; // LATIN CAPITAL LETTER I WITH ACUTE case 0xC24C: return 0x0139; // LATIN CAPITAL LETTER L WITH ACUTE case 0xC24E: return 0x0143; // LATIN CAPITAL LETTER N WITH ACUTE case 0xC24F: return 0x00D3; // LATIN CAPITAL LETTER O WITH ACUTE case 0xC252: return 0x0154; // LATIN CAPITAL LETTER R WITH ACUTE case 0xC253: return 0x015A; // LATIN CAPITAL LETTER S WITH ACUTE case 0xC255: return 0x00DA; // LATIN CAPITAL LETTER U WITH ACUTE case 0xC259: return 0x00DD; // LATIN CAPITAL LETTER Y WITH ACUTE case 0xC25A: return 0x0179; // LATIN CAPITAL LETTER Z WITH ACUTE case 0xC261: return 0x00E1; // LATIN SMALL LETTER A WITH ACUTE case 0xC263: return 0x0107; // LATIN SMALL LETTER C WITH ACUTE case 0xC265: return 0x00E9; // LATIN SMALL LETTER E WITH ACUTE case 0xC267: return 0x01F5; // LATIN SMALL LETTER G WITH CEDILLA(4) case 0xC269: return 0x00ED; // LATIN SMALL LETTER I WITH ACUTE case 0xC26C: return 0x013A; // LATIN SMALL LETTER L WITH ACUTE case 0xC26E: return 0x0144; // LATIN SMALL LETTER N WITH ACUTE case 0xC26F: return 0x00F3; // LATIN SMALL LETTER O WITH ACUTE case 0xC272: return 0x0155; // LATIN SMALL LETTER R WITH ACUTE case 0xC273: return 0x015B; // LATIN SMALL LETTER S WITH ACUTE case 0xC275: return 0x00FA; // LATIN SMALL LETTER U WITH ACUTE case 0xC279: return 0x00FD; // LATIN SMALL LETTER Y WITH ACUTE case 0xC27A: return 0x017A; // LATIN SMALL LETTER Z WITH ACUTE // 12/03 non-spacing circumflex accent case 0xC341: return 0x00C2; // LATIN CAPITAL LETTER A WITH CIRCUMFLEX case 0xC343: return 0x0108; // LATIN CAPITAL LETTER C WITH CIRCUMFLEX case 0xC345: return 0x00CA; // LATIN CAPITAL LETTER E WITH CIRCUMFLEX case 0xC347: return 0x011C; // LATIN CAPITAL LETTER G WITH CIRCUMFLEX case 0xC348: return 0x0124; // LATIN CAPITAL LETTER H WITH CIRCUMFLEX case 0xC349: return 0x00CE; // LATIN CAPITAL LETTER I WITH CIRCUMFLEX case 0xC34A: return 0x0134; // LATIN CAPITAL LETTER J WITH CIRCUMFLEX case 0xC34F: return 0x00D4; // LATIN CAPITAL LETTER O WITH CIRCUMFLEX case 0xC353: return 0x015C; // LATIN CAPITAL LETTER S WITH CIRCUMFLEX case 0xC355: return 0x00DB; // LATIN CAPITAL LETTER U WITH CIRCUMFLEX case 0xC357: return 0x0174; // LATIN CAPITAL LETTER W WITH CIRCUMFLEX case 0xC359: return 0x0176; // LATIN CAPITAL LETTER Y WITH CIRCUMFLEX case 0xC361: return 0x00E2; // LATIN SMALL LETTER A WITH CIRCUMFLEX case 0xC363: return 0x0109; // LATIN SMALL LETTER C WITH CIRCUMFLEX case 0xC365: return 0x00EA; // LATIN SMALL LETTER E WITH CIRCUMFLEX case 0xC367: return 0x011D; // LATIN SMALL LETTER G WITH CIRCUMFLEX case 0xC368: return 0x0125; // LATIN SMALL LETTER H WITH CIRCUMFLEX case 0xC369: return 0x00EE; // LATIN SMALL LETTER I WITH CIRCUMFLEX case 0xC36A: return 0x0135; // LATIN SMALL LETTER J WITH CIRCUMFLEX case 0xC36F: return 0x00F4; // LATIN SMALL LETTER O WITH CIRCUMFLEX case 0xC373: return 0x015D; // LATIN SMALL LETTER S WITH CIRCUMFLEX case 0xC375: return 0x00FB; // LATIN SMALL LETTER U WITH CIRCUMFLEX case 0xC377: return 0x0175; // LATIN SMALL LETTER W WITH CIRCUMFLEX case 0xC379: return 0x0177; // LATIN SMALL LETTER Y WITH CIRCUMFLEX // 12/04 non-spacing tilde case 0xC441: return 0x00C3; // LATIN CAPITAL LETTER A WITH TILDE case 0xC449: return 0x0128; // LATIN CAPITAL LETTER I WITH TILDE case 0xC44E: return 0x00D1; // LATIN CAPITAL LETTER N WITH TILDE case 0xC44F: return 0x00D5; // LATIN CAPITAL LETTER O WITH TILDE case 0xC455: return 0x0168; // LATIN CAPITAL LETTER U WITH TILDE case 0xC461: return 0x00E3; // LATIN SMALL LETTER A WITH TILDE case 0xC469: return 0x0129; // LATIN SMALL LETTER I WITH TILDE case 0xC46E: return 0x00F1; // LATIN SMALL LETTER N WITH TILDE case 0xC46F: return 0x00F5; // LATIN SMALL LETTER O WITH TILDE case 0xC475: return 0x0169; // LATIN SMALL LETTER U WITH TILDE // 12/05 non-spacing macron case 0xC541: return 0x0100; // LATIN CAPITAL LETTER A WITH MACRON case 0xC545: return 0x0112; // LATIN CAPITAL LETTER E WITH MACRON case 0xC549: return 0x012A; // LATIN CAPITAL LETTER I WITH MACRON case 0xC54F: return 0x014C; // LATIN CAPITAL LETTER O WITH MACRON case 0xC555: return 0x016A; // LATIN CAPITAL LETTER U WITH MACRON case 0xC561: return 0x0101; // LATIN SMALL LETTER A WITH MACRON case 0xC565: return 0x0113; // LATIN SMALL LETTER E WITH MACRON case 0xC569: return 0x012B; // LATIN SMALL LETTER I WITH MACRON case 0xC56F: return 0x014D; // LATIN SMALL LETTER O WITH MACRON case 0xC575: return 0x016B; // LATIN SMALL LETTER U WITH MACRON // 12/06 non-spacing breve case 0xC620: return 0x02D8; // BREVE case 0xC641: return 0x0102; // LATIN CAPITAL LETTER A WITH BREVE case 0xC647: return 0x011E; // LATIN CAPITAL LETTER G WITH BREVE case 0xC655: return 0x016C; // LATIN CAPITAL LETTER U WITH BREVE case 0xC661: return 0x0103; // LATIN SMALL LETTER A WITH BREVE case 0xC667: return 0x011F; // LATIN SMALL LETTER G WITH BREVE case 0xC675: return 0x016D; // LATIN SMALL LETTER U WITH BREVE // 12/07 non-spacing dot above case 0xC743: return 0x010A; // LATIN CAPITAL LETTER C WITH DOT ABOVE case 0xC745: return 0x0116; // LATIN CAPITAL LETTER E WITH DOT ABOVE case 0xC747: return 0x0120; // LATIN CAPITAL LETTER G WITH DOT ABOVE case 0xC749: return 0x0130; // LATIN CAPITAL LETTER I WITH DOT ABOVE case 0xC75A: return 0x017B; // LATIN CAPITAL LETTER Z WITH DOT ABOVE case 0xC763: return 0x010B; // LATIN SMALL LETTER C WITH DOT ABOVE case 0xC765: return 0x0117; // LATIN SMALL LETTER E WITH DOT ABOVE case 0xC767: return 0x0121; // LATIN SMALL LETTER G WITH DOT ABOVE case 0xC77A: return 0x017C; // LATIN SMALL LETTER Z WITH DOT ABOVE // 12/08 non-spacing diaeresis case 0xC820: return 0x00A8; // DIAERESIS case 0xC841: return 0x00C4; // LATIN CAPITAL LETTER A WITH DIAERESIS case 0xC845: return 0x00CB; // LATIN CAPITAL LETTER E WITH DIAERESIS case 0xC849: return 0x00CF; // LATIN CAPITAL LETTER I WITH DIAERESIS case 0xC84F: return 0x00D6; // LATIN CAPITAL LETTER O WITH DIAERESIS case 0xC855: return 0x00DC; // LATIN CAPITAL LETTER U WITH DIAERESIS case 0xC859: return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS case 0xC861: return 0x00E4; // LATIN SMALL LETTER A WITH DIAERESIS case 0xC865: return 0x00EB; // LATIN SMALL LETTER E WITH DIAERESIS case 0xC869: return 0x00EF; // LATIN SMALL LETTER I WITH DIAERESIS case 0xC86F: return 0x00F6; // LATIN SMALL LETTER O WITH DIAERESIS case 0xC875: return 0x00FC; // LATIN SMALL LETTER U WITH DIAERESIS case 0xC879: return 0x00FF; // LATIN SMALL LETTER Y WITH DIAERESIS // 12/09 (This position shall not be used) // 12/10 non-spacing ring above case 0xCA20: return 0x02DA; // RING ABOVE case 0xCA41: return 0x00C5; // LATIN CAPITAL LETTER A WITH RING ABOVE case 0xCAAD: return 0x016E; // LATIN CAPITAL LETTER U WITH RING ABOVE case 0xCA61: return 0x00E5; // LATIN SMALL LETTER A WITH RING ABOVE case 0xCA75: return 0x016F; // LATIN SMALL LETTER U WITH RING ABOVE // 12/11 non-spacing cedilla case 0xCB20: return 0x00B8; // CEDILLA case 0xCB43: return 0x00C7; // LATIN CAPITAL LETTER C WITH CEDILLA case 0xCB47: return 0x0122; // LATIN CAPITAL LETTER G WITH CEDILLA case 0xCB4B: return 0x0136; // LATIN CAPITAL LETTER K WITH CEDILLA case 0xCB4C: return 0x013B; // LATIN CAPITAL LETTER L WITH CEDILLA case 0xCB4E: return 0x0145; // LATIN CAPITAL LETTER N WITH CEDILLA case 0xCB52: return 0x0156; // LATIN CAPITAL LETTER R WITH CEDILLA case 0xCB53: return 0x015E; // LATIN CAPITAL LETTER S WITH CEDILLA case 0xCB54: return 0x0162; // LATIN CAPITAL LETTER T WITH CEDILLA case 0xCB63: return 0x00E7; // LATIN SMALL LETTER C WITH CEDILLA // case 0xCB67: return 0x0123; // small g with cedilla case 0xCB6B: return 0x0137; // LATIN SMALL LETTER K WITH CEDILLA case 0xCB6C: return 0x013C; // LATIN SMALL LETTER L WITH CEDILLA case 0xCB6E: return 0x0146; // LATIN SMALL LETTER N WITH CEDILLA case 0xCB72: return 0x0157; // LATIN SMALL LETTER R WITH CEDILLA case 0xCB73: return 0x015F; // LATIN SMALL LETTER S WITH CEDILLA case 0xCB74: return 0x0163; // LATIN SMALL LETTER T WITH CEDILLA // 12/12 (This position shall not be used) // 12/13 non-spacing double acute accent case 0xCD4F: return 0x0150; // LATIN CAPITAL LETTER O WITH DOUBLE ACUTE case 0xCD55: return 0x0170; // LATIN CAPITAL LETTER U WITH DOUBLE ACUTE case 0xCD6F: return 0x0151; // LATIN SMALL LETTER O WITH DOUBLE ACUTE case 0xCD75: return 0x0171; // LATIN SMALL LETTER U WITH DOUBLE ACUTE // 12/14 non-spacing ogonek case 0xCE20: return 0x02DB; // ogonek case 0xCE41: return 0x0104; // LATIN CAPITAL LETTER A WITH OGONEK case 0xCE45: return 0x0118; // LATIN CAPITAL LETTER E WITH OGONEK case 0xCE49: return 0x012E; // LATIN CAPITAL LETTER I WITH OGONEK case 0xCE55: return 0x0172; // LATIN CAPITAL LETTER U WITH OGONEK case 0xCE61: return 0x0105; // LATIN SMALL LETTER A WITH OGONEK case 0xCE65: return 0x0119; // LATIN SMALL LETTER E WITH OGONEK case 0xCE69: return 0x012F; // LATIN SMALL LETTER I WITH OGONEK case 0xCE75: return 0x0173; // LATIN SMALL LETTER U WITH OGONEK // 12/15 non-spacing caron case 0xCF20: return 0x02C7; // CARON case 0xCF43: return 0x010C; // LATIN CAPITAL LETTER C WITH CARON case 0xCF44: return 0x010E; // LATIN CAPITAL LETTER D WITH CARON case 0xCF45: return 0x011A; // LATIN CAPITAL LETTER E WITH CARON case 0xCF4C: return 0x013D; // LATIN CAPITAL LETTER L WITH CARON case 0xCF4E: return 0x0147; // LATIN CAPITAL LETTER N WITH CARON case 0xCF52: return 0x0158; // LATIN CAPITAL LETTER R WITH CARON case 0xCF53: return 0x0160; // LATIN CAPITAL LETTER S WITH CARON case 0xCF54: return 0x0164; // LATIN CAPITAL LETTER T WITH CARON case 0xCF5A: return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON case 0xCF63: return 0x010D; // LATIN SMALL LETTER C WITH CARON case 0xCF64: return 0x010F; // LATIN SMALL LETTER D WITH CARON case 0xCF65: return 0x011B; // LATIN SMALL LETTER E WITH CARON case 0xCF6C: return 0x013E; // LATIN SMALL LETTER L WITH CARON case 0xCF6E: return 0x0148; // LATIN SMALL LETTER N WITH CARON case 0xCF72: return 0x0159; // LATIN SMALL LETTER R WITH CARON case 0xCF73: return 0x0161; // LATIN SMALL LETTER S WITH CARON case 0xCF74: return 0x0165; // LATIN SMALL LETTER T WITH CARON case 0xCF7A: return 0x017E; // LATIN SMALL LETTER Z WITH CARON default: myDebug() << "Iso6937Converter::getCombiningChar() - no match for " << c << endl; return TQChar(); } }