diff options
author | Slávek Banko <[email protected]> | 2013-06-24 02:08:15 +0200 |
---|---|---|
committer | Slávek Banko <[email protected]> | 2013-07-04 02:44:37 +0200 |
commit | 998f21e02a725cd553d7c278819f67cd81295af4 (patch) | |
tree | 4bd158018e9302c31367b00c01cd2b41eb228414 /src/encoderlatex.cpp | |
download | kbibtex-998f21e02a725cd553d7c278819f67cd81295af4.tar.gz kbibtex-998f21e02a725cd553d7c278819f67cd81295af4.zip |
Initial import
Diffstat (limited to 'src/encoderlatex.cpp')
-rw-r--r-- | src/encoderlatex.cpp | 876 |
1 files changed, 876 insertions, 0 deletions
diff --git a/src/encoderlatex.cpp b/src/encoderlatex.cpp new file mode 100644 index 0000000..f111848 --- /dev/null +++ b/src/encoderlatex.cpp @@ -0,0 +1,876 @@ +/*************************************************************************** +* Copyright (C) 2004-2009 by Thomas Fischer * +* [email protected] * +* * +* This program is free software; you can redistribute it and/or modify * +* it under the terms of the GNU General Public License as published by * +* the Free Software Foundation; either version 2 of the License, or * +* (at your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU General Public License for more details. * +* * +* You should have received a copy of the GNU General Public License * +* along with this program; if not, write to the * +* Free Software Foundation, Inc., * +* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * +***************************************************************************/ +#include <qstring.h> +#include <qapplication.h> +#include <qregexp.h> + +#include "encoderlatex.h" + +namespace BibTeX +{ + EncoderLaTeX *EncoderLaTeX::encoderLaTeX = NULL; + + static struct Decomposition + { + const char *latexCommand; + unsigned int unicode; + } + decompositions[] = + { + {"`", 0x0300}, + {"'", 0x0301}, + {"^", 0x0302}, + {"~", 0x0303}, + {"=", 0x0304}, + /*{"x", 0x0305}, OVERLINE */ + {"u", 0x0306}, + {".", 0x0307}, + /*{"x", 0x0309}, HOOK ABOVE */ + {"r", 0x030a}, + {"H", 0x030b}, + {"v", 0x030c}, + /*{"x", 0x030d}, VERTICAL LINE ABOVE */ + /*{"x", 0x030e}, DOUBLE VERTICAL LINE ABOVE */ + /*{"x", 0x030f}, DOUBLE GRAVE ACCENT */ + /*{"x", 0x0310}, CANDRABINDU */ + /*{"x", 0x0311}, INVERTED BREVE */ + /*{"x", 0x0312}, TURNED COMMA ABOVE */ + /*{"x", 0x0313}, COMMA ABOVE */ + /*{"x", 0x0314}, REVERSED COMMA ABOVE */ + /*{"x", 0x0315}, */ + /*{"x", 0x0316}, */ + /*{"x", 0x0317}, */ + /*{"x", 0x0318}, */ + /*{"x", 0x0319}, */ + /*{"x", 0x031a}, */ + /*{"x", 0x031b}, */ + /*{"x", 0x031c}, */ + /*{"x", 0x031d}, */ + /*{"x", 0x031e}, */ + /*{"x", 0x031f}, */ + /*{"x", 0x0320}, */ + /*{"x", 0x0321}, */ + /*{"x", 0x0322}, */ + {"d", 0x0323}, + /*{"x", 0x0324}, */ + /*{"x", 0x0325}, */ + /*{"x", 0x0326}, */ + {"d", 0x0327}, + {"k", 0x0328}, + /*{"x", 0x0329}, */ + /*{"x", 0x032a}, */ + /*{"x", 0x032b}, */ + /*{"x", 0x032c}, */ + /*{"x", 0x032d}, */ + /*{"x", 0x032e}, */ + /*{"x", 0x032f}, */ + {"b", 0x0331}, + {"t", 0x0361} + }; + + static const int decompositionscount = sizeof( decompositions ) / sizeof( decompositions[ 0 ] ) ; + + static const struct EncoderLaTeXCommandMapping + { + const char *letters; + unsigned int unicode; + } + commandmappingdatalatex[] = + { + {"AA", 0x00C5}, + {"AE", 0x00C6}, + {"ss", 0x00DF}, + {"aa", 0x00E5}, + {"ae", 0x00E6}, + {"OE", 0x0152}, + {"oe", 0x0153}, + {"ldots", 0x2026}, + {"L", 0x0141}, + {"l", 0x0142}, + {"grqq", 0x201C}, + {"glqq", 0x201E}, + {"frqq", 0x00BB}, + {"flqq", 0x00AB}, + +// awk -F '[{}\\\\]+' '/DeclareUnicodeCharacter/ { print "{\""$4"\", 0x"$3"},"}' /usr/share/texmf-dist/tex/latex/base/t2aenc.dfu | grep '0x04' | sort -r -f + {"cyrzhdsc", 0x0497}, + {"CYRZHDSC", 0x0496}, + {"cyrzh", 0x0436}, + {"CYRZH", 0x0416}, + {"cyrzdsc", 0x0499}, + {"CYRZDSC", 0x0498}, + {"cyrz", 0x0437}, + {"CYRZ", 0x0417}, + {"cyryu", 0x044E}, + {"CYRYU", 0x042E}, + {"cyryo", 0x0451}, + {"CYRYO", 0x0401}, + {"cyryi", 0x0457}, + {"CYRYI", 0x0407}, + {"cyryhcrs", 0x04B1}, + {"CYRYHCRS", 0x04B0}, + {"cyrya", 0x044F}, + {"CYRYA", 0x042F}, + {"cyry", 0x04AF}, + {"CYRY", 0x04AE}, + {"cyrv", 0x0432}, + {"CYRV", 0x0412}, + {"cyrushrt", 0x045E}, + {"CYRUSHRT", 0x040E}, + {"cyru", 0x0443}, + {"CYRU", 0x0423}, + {"cyrtshe", 0x045B}, + {"CYRTSHE", 0x040B}, + {"cyrtdsc", 0x04AD}, + {"CYRTDSC", 0x04AC}, + {"cyrt", 0x0442}, + {"CYRT", 0x0422}, + {"cyrshha", 0x04BB}, + {"CYRSHHA", 0x04BA}, + {"cyrshch", 0x0449}, + {"CYRSHCH", 0x0429}, + {"cyrsh", 0x0448}, + {"CYRSH", 0x0428}, + {"cyrsftsn", 0x044C}, + {"CYRSFTSN", 0x042C}, + {"cyrsdsc", 0x04AB}, + {"CYRSDSC", 0x04AA}, + {"cyrschwa", 0x04D9}, + {"CYRSCHWA", 0x04D8}, + {"cyrs", 0x0441}, + {"CYRS", 0x0421}, + {"cyrr", 0x0440}, + {"CYRR", 0x0420}, + {"CYRpalochka", 0x04C0}, + {"cyrp", 0x043F}, + {"CYRP", 0x041F}, + {"cyrotld", 0x04E9}, + {"CYROTLD", 0x04E8}, + {"cyro", 0x043E}, + {"CYRO", 0x041E}, + {"cyrnje", 0x045A}, + {"CYRNJE", 0x040A}, + {"cyrng", 0x04A5}, + {"CYRNG", 0x04A4}, + {"cyrndsc", 0x04A3}, + {"CYRNDSC", 0x04A2}, + {"cyrn", 0x043D}, + {"CYRN", 0x041D}, + {"cyrm", 0x043C}, + {"CYRM", 0x041C}, + {"cyrlje", 0x0459}, + {"CYRLJE", 0x0409}, + {"cyrl", 0x043B}, + {"CYRL", 0x041B}, + {"cyrkvcrs", 0x049D}, + {"CYRKVCRS", 0x049C}, + {"cyrkdsc", 0x049B}, + {"CYRKDSC", 0x049A}, + {"cyrk", 0x043A}, + {"CYRK", 0x041A}, + {"cyrje", 0x0458}, + {"CYRJE", 0x0408}, + {"cyrishrt", 0x0439}, + {"CYRISHRT", 0x0419}, + {"cyrii", 0x0456}, + {"CYRII", 0x0406}, + {"cyrie", 0x0454}, + {"CYRIE", 0x0404}, + {"cyri", 0x0438}, + {"CYRI", 0x0418}, + {"cyrhrdsn", 0x044A}, + {"CYRHRDSN", 0x042A}, + {"cyrhdsc", 0x04B3}, + {"CYRHDSC", 0x04B2}, + {"cyrh", 0x0445}, + {"CYRH", 0x0425}, + {"cyrgup", 0x0491}, + {"CYRGUP", 0x0490}, + {"cyrghcrs", 0x0493}, + {"CYRGHCRS", 0x0492}, + {"cyrg", 0x0433}, + {"CYRG", 0x0413}, + {"cyrf", 0x0444}, + {"CYRF", 0x0424}, + {"cyrery", 0x044B}, + {"CYRERY", 0x042B}, + {"cyrerev", 0x044D}, + {"CYREREV", 0x042D}, + {"cyre", 0x0435}, + {"CYRE", 0x0415}, + {"cyrdzhe", 0x045F}, + {"CYRDZHE", 0x040F}, + {"cyrdze", 0x0455}, + {"CYRDZE", 0x0405}, + {"cyrdje", 0x0452}, + {"CYRDJE", 0x0402}, + {"cyrd", 0x0434}, + {"CYRD", 0x0414}, + {"cyrchvcrs", 0x04B9}, + {"CYRCHVCRS", 0x04B8}, + {"cyrchrdsc", 0x04B7}, + {"CYRCHRDSC", 0x04B6}, + {"cyrch", 0x0447}, + {"CYRCH", 0x0427}, + {"cyrc", 0x0446}, + {"CYRC", 0x0426}, + {"cyrb", 0x0431}, + {"CYRB", 0x0411}, + {"cyrae", 0x04D5}, + {"CYRAE", 0x04D4}, + {"cyra", 0x0430}, + {"CYRA", 0x0410} + }; + + static const int commandmappingdatalatexcount = sizeof( commandmappingdatalatex ) / sizeof( commandmappingdatalatex[ 0 ] ) ; + + /** Command can be either + (1) {embraced} + (2) delimited by {}, + (3) <space>, line end, + (4) \following_command (including \<space>, which must be maintained!), + (5) } (end of entry or group) + **/ + const char *expansionsCmd[] = {"\\{\\\\%1\\}", "\\\\%1\\{\\}", "\\\\%1(\\n|\\r|\\\\|\\})", "\\\\%1\\s"}; + static const int expansionscmdcount = sizeof( expansionsCmd ) / sizeof( expansionsCmd[0] ); + + static const struct EncoderLaTeXModCharMapping + { + const char *modifier; + const char *letter; + unsigned int unicode; + } + modcharmappingdatalatex[] = + { + {"\\\\`", "A", 0x00C0}, + {"\\\\'", "A", 0x00C1}, + {"\\\\\\^", "A", 0x00C2}, + {"\\\\~", "A", 0x00C3}, + {"\\\\\"", "A", 0x00C4}, + {"\\\\r", "A", 0x00C5}, + /** 0x00C6 */ + {"\\\\c", "C", 0x00C7}, + {"\\\\`", "E", 0x00C8}, + {"\\\\'", "E", 0x00C9}, + {"\\\\\\^", "E", 0x00CA}, + {"\\\\\"", "E", 0x00CB}, + {"\\\\`", "I", 0x00CC}, + {"\\\\'", "I", 0x00CD}, + {"\\\\\\^", "I", 0x00CE}, + {"\\\\\"", "I", 0x00CF}, + /** 0x00D0 */ + {"\\\\~", "N", 0x00D1}, + {"\\\\`", "O", 0x00D2}, + {"\\\\'", "O", 0x00D3}, + {"\\\\\\^", "O", 0x00D4}, + /** 0x00D5 */ + {"\\\\\"", "O", 0x00D6}, + /** 0x00D7 */ + {"\\\\", "O", 0x00D8}, + {"\\\\`", "U", 0x00D9}, + {"\\\\'", "U", 0x00DA}, + {"\\\\\\^", "U", 0x00DB}, + {"\\\\\"", "U", 0x00DC}, + {"\\\\'", "Y", 0x00DD}, + /** 0x00DE */ + {"\\\\\"", "s", 0x00DF}, + {"\\\\`", "a", 0x00E0}, + {"\\\\'", "a", 0x00E1}, + {"\\\\\\^", "a", 0x00E2}, + {"\\\\~", "a", 0x00E3}, + {"\\\\\"", "a", 0x00E4}, + {"\\\\r", "a", 0x00E5}, + /** 0x00E6 */ + {"\\\\c", "c", 0x00E7}, + {"\\\\`", "e", 0x00E8}, + {"\\\\'", "e", 0x00E9}, + {"\\\\\\^", "e", 0x00EA}, + {"\\\\\"", "e", 0x00EB}, + {"\\\\`", "i", 0x00EC}, + {"\\\\'", "i", 0x00ED}, + {"\\\\'", "\\\\i", 0x00ED}, + {"\\\\\\^", "i", 0x00EE}, + /** 0x00EF */ + /** 0x00F0 */ + {"\\\\~", "n", 0x00F1}, + {"\\\\`", "o", 0x00F2}, + {"\\\\'", "o", 0x00F3}, + {"\\\\\\^", "o", 0x00F4}, + /** 0x00F5 */ + {"\\\\\"", "o", 0x00F6}, + /** 0x00F7 */ + {"\\\\", "o", 0x00F8}, + {"\\\\`", "u", 0x00F9}, + {"\\\\'", "u", 0x00FA}, + {"\\\\\\^", "u", 0x00FB}, + {"\\\\\"", "u", 0x00FC}, + {"\\\\'", "y", 0x00FD}, + /** 0x00FE */ + /** 0x00FF */ + /** 0x0100 */ + /** 0x0101 */ + {"\\\\u", "A", 0x0102}, + {"\\\\u", "a", 0x0103}, + /** 0x0104 */ + /** 0x0105 */ + {"\\\\'", "C", 0x0106}, + {"\\\\'", "c", 0x0107}, + /** 0x0108 */ + /** 0x0109 */ + /** 0x010A */ + /** 0x010B */ + {"\\\\v", "C", 0x010C}, + {"\\\\v", "c", 0x010D}, + {"\\\\v", "D", 0x010E}, + /** 0x010F */ + /** 0x0110 */ + /** 0x0111 */ + /** 0x0112 */ + /** 0x0113 */ + /** 0x0114 */ + /** 0x0115 */ + /** 0x0116 */ + /** 0x0117 */ + {"\\\\c", "E", 0x0118}, + {"\\\\c", "e", 0x0119}, + {"\\\\v", "E", 0x011A}, + {"\\\\v", "e", 0x011B}, + /** 0x011C */ + /** 0x011D */ + {"\\\\u", "G", 0x011E}, + {"\\\\u", "g", 0x011F}, + /** 0x0120 */ + /** 0x0121 */ + /** 0x0122 */ + /** 0x0123 */ + /** 0x0124 */ + /** 0x0125 */ + /** 0x0126 */ + /** 0x0127 */ + /** 0x0128 */ + /** 0x0129 */ + /** 0x012A */ + /** 0x012B */ + {"\\\\u", "I", 0x012C}, + {"\\\\u", "i", 0x012D}, + /** 0x012E */ + /** 0x012F */ + /** 0x0130 */ + /** 0x0131 */ + /** 0x0132 */ + /** 0x0133 */ + /** 0x0134 */ + /** 0x0135 */ + /** 0x0136 */ + /** 0x0137 */ + /** 0x0138 */ + {"\\\\'", "L", 0x0139}, + {"\\\\'", "l", 0x013A}, + /** 0x013B */ + /** 0x013C */ + /** 0x013D */ + /** 0x013E */ + /** 0x013F */ + /** 0x0140 */ + /** 0x0141 */ + /** 0x0142 */ + {"\\\\'", "N", 0x0143}, + {"\\\\'", "n", 0x0144}, + /** 0x0145 */ + /** 0x0146 */ + {"\\\\v", "N", 0x0147}, + {"\\\\v", "n", 0x0148}, + /** 0x0149 */ + /** 0x014A */ + /** 0x014B */ + /** 0x014C */ + /** 0x014D */ + {"\\\\u", "O", 0x014E}, + {"\\\\u", "o", 0x014F}, + {"\\\\H", "O", 0x0150}, + {"\\\\H", "o", 0x0151}, + /** 0x0152 */ + /** 0x0153 */ + {"\\\\'", "R", 0x0154}, + {"\\\\'", "r", 0x0155}, + /** 0x0156 */ + /** 0x0157 */ + {"\\\\v", "R", 0x0158}, + {"\\\\v", "r", 0x0159}, + {"\\\\'", "S", 0x015A}, + {"\\\\'", "s", 0x015B}, + /** 0x015C */ + /** 0x015D */ + {"\\\\c", "S", 0x015E}, + {"\\\\c", "s", 0x015F}, + {"\\\\v", "S", 0x0160}, + {"\\\\v", "s", 0x0161}, + /** 0x0162 */ + /** 0x0163 */ + {"\\\\v", "T", 0x0164}, + /** 0x0165 */ + /** 0x0166 */ + /** 0x0167 */ + /** 0x0168 */ + /** 0x0169 */ + /** 0x016A */ + /** 0x016B */ + {"\\\\u", "U", 0x016C}, + {"\\\\u", "u", 0x016D}, + {"\\\\r", "U", 0x016E}, + {"\\\\r", "u", 0x016F}, + /** 0x0170 */ + /** 0x0171 */ + /** 0x0172 */ + /** 0x0173 */ + /** 0x0174 */ + /** 0x0175 */ + /** 0x0176 */ + /** 0x0177 */ + {"\\\\\"", "Y", 0x0178}, + {"\\\\'", "Z", 0x0179}, + {"\\\\'", "z", 0x017A}, + /** 0x017B */ + /** 0x017C */ + {"\\\\v", "Z", 0x017D}, + {"\\\\v", "z", 0x017E}, + /** 0x017F */ + /** 0x0180 */ + {"\\\\v", "A", 0x01CD}, + {"\\\\v", "a", 0x01CE}, + {"\\\\v", "G", 0x01E6}, + {"\\\\v", "g", 0x01E7} + }; + + const char *expansionsMod1[] = {"\\{%1\\{%2\\}\\}", "\\{%1 %2\\}", "%1\\{%2\\}"}; + static const int expansionsmod1count = sizeof( expansionsMod1 ) / sizeof( expansionsMod1[0] ); + const char *expansionsMod2[] = {"\\{%1%2\\}", "%1%2\\{\\}", "%1%2"}; + static const int expansionsmod2count = sizeof( expansionsMod2 ) / sizeof( expansionsMod2[0] ); + + static const int modcharmappingdatalatexcount = sizeof( modcharmappingdatalatex ) / sizeof( modcharmappingdatalatex[ 0 ] ) ; + + static const struct EncoderLaTeXCharMapping + { + const char *regexp; + unsigned int unicode; + const char *latex; + } + charmappingdatalatex[] = + { + {"\\\\#", 0x0023, "\\#"}, + {"\\\\&", 0x0026, "\\&"}, + {"\\\\_", 0x005F, "\\_"}, + {"!`", 0x00A1, "!`"}, + {"\"<", 0x00AB, "\"<"}, + {"\">", 0x00BB, "\">"}, + {"[?]`", 0x00BF, "?`"}, + {"--", 0x2013, "--"} + }; + + static const int charmappingdatalatexcount = sizeof( charmappingdatalatex ) / sizeof( charmappingdatalatex[ 0 ] ) ; + + EncoderLaTeX::EncoderLaTeX() + { + buildCharMapping(); + buildCombinedMapping(); + } + + EncoderLaTeX::~EncoderLaTeX() + { + // nothing + } + + QString EncoderLaTeX::decode( const QString & text ) + { + const QString splitMarker = "|KBIBTEX|"; + + /** start-stop marker ensures that each text starts and stops + * with plain text and not with an inline math environment. + * This invariant is exploited implicitly in the code below. */ + const QString startStopMarker="|STARTSTOP|"; + QString result = startStopMarker + text + startStopMarker; + + /** Collect (all?) urls from the BibTeX file and store them in urls */ + /** Problem is that the replace function below will replace + * character sequences in the URL rendering the URL invalid. + * Later, all URLs will be replaced back to their original + * in the hope nothing breaks ... */ + QStringList urls; + QRegExp httpRegExp( "(ht|f)tp://[^\"} ]+" ); + httpRegExp.setMinimal( false ); + int pos = 0; + while ( pos >= 0 ) + { + pos = httpRegExp.search( result, pos ); + if ( pos >= 0 ) + { + ++pos; + QString url = httpRegExp.cap( 0 ); + urls << url; + } + } + + decomposedUTF8toLaTeX( result ); + + /** split text into math and non-math regions */ + QStringList intermediate = QStringList::split( '$', result, true ); + QStringList::Iterator it = intermediate.begin(); + while ( it != intermediate.end() ) + { + /** + * Sometimes we split strings like "\$", which is not intended. + * So, we have to manually fix things by checking for strings + * ending with "\" and append both the removed dollar sign and + * the following string (which was never supposed to be an + * independent string). Finally, we remove the unnecessary + * string and continue. + */ + if (( *it ).endsWith( "\\" ) ) + { + QStringList::Iterator cur = it; + ++it; + ( *cur ).append( '$' ).append( *it ); + intermediate.remove( it ); + it = cur; + } + else + ++it; + } + + qApp->processEvents(); + + result = ""; + for ( QStringList::Iterator it = intermediate.begin(); it != intermediate.end(); ++it ) + { + if ( !result.isEmpty() ) result.append( splitMarker ); + result.append( *it ); + + ++it; + if ( it == intermediate.end() ) + break; + + if (( *it ).length() > 256 ) + qDebug( "Very long math equation using $ found, maybe due to broken inline math: %s", ( *it ).left( 48 ).latin1() ); + } + + qApp->processEvents(); + + for ( QValueList<CharMappingItem>::ConstIterator cmit = m_charMapping.begin(); cmit != m_charMapping.end(); ++cmit ) + result.replace(( *cmit ).regExp, ( *cmit ).unicode ); + + qApp->processEvents(); + + QStringList transformed = QStringList::split( splitMarker, result, true ); + + qApp->processEvents(); + + result = ""; + for ( QStringList::Iterator itt = transformed.begin(), iti = intermediate.begin(); itt != transformed.end() && iti != intermediate.end(); ++itt, ++iti ) + { + result.append( *itt ); + + ++iti; + if ( iti == intermediate.end() ) + break; + + result.append( "$" ).append( *iti ).append( "$" ); + } + + qApp->processEvents(); + + /** Reinserting original URLs as explained above */ + pos = 0; + int idx = 0; + while ( pos >= 0 ) + { + pos = httpRegExp.search( result, pos ); + if ( pos >= 0 ) + { + ++pos; + int len = httpRegExp.cap( 0 ).length(); + result = result.left( pos - 1 ).append( urls[idx++] ).append( result.mid( pos + len - 1 ) ); + } + } + + return result.replace( startStopMarker,"" ); + } + + QString EncoderLaTeX::encode( const QString & text ) + { + const QString splitMarker = "|KBIBTEX|"; + + /** start-stop marker ensures that each text starts and stops + * with plain text and not with an inline math environment. + * This invariant is exploited implicitly in the code below. */ + const QString startStopMarker="|STARTSTOP|"; + QString result = startStopMarker + text + startStopMarker; + + /** Collect (all?) urls from the BibTeX file and store them in urls */ + /** Problem is that the replace function below will replace + * character sequences in the URL rendering the URL invalid. + * Later, all URLs will be replaced back to their original + * in the hope nothing breaks ... */ + QStringList urls; + QRegExp httpRegExp( "(ht|f)tp://[^\"} ]+" ); + httpRegExp.setMinimal( false ); + int pos = 0; + while ( pos >= 0 ) + { + pos = httpRegExp.search( result, pos ); + if ( pos >= 0 ) + { + ++pos; + QString url = httpRegExp.cap( 0 ); + urls << url; + } + } + + /** split text into math and non-math regions */ + QStringList intermediate = QStringList::split( '$', result, true ); + QStringList::Iterator it = intermediate.begin(); + while ( it != intermediate.end() ) + { + /** + * Sometimes we split strings like "\$", which is not intended. + * So, we have to manually fix things by checking for strings + * ending with "\" and append both the removed dollar sign and + * the following string (which was never supposed to be an + * independent string). Finally, we remove the unnecessary + * string and continue. + */ + if (( *it ).endsWith( "\\" ) ) + { + QStringList::Iterator cur = it; + ++it; + ( *cur ).append( '$' ).append( *it ); + intermediate.remove( it ); + it = cur; + } + else + ++it; + } + + qApp->processEvents(); + + result = ""; + for ( QStringList::Iterator it = intermediate.begin(); it != intermediate.end(); ++it ) + { + if ( !result.isEmpty() ) result.append( splitMarker ); + result.append( *it ); + + ++it; + if ( it == intermediate.end() ) + break; + + if (( *it ).length() > 256 ) + qDebug( "Very long math equation using $ found, maybe due to broken inline math: %s", ( *it ).left( 48 ).latin1() ); + } + + qApp->processEvents(); + + for ( QValueList<CharMappingItem>::ConstIterator cmit = m_charMapping.begin(); cmit != m_charMapping.end(); ++cmit ) + result.replace(( *cmit ).unicode, ( *cmit ).latex ); + + qApp->processEvents(); + + QStringList transformed = QStringList::split( splitMarker, result, true ); + + qApp->processEvents(); + + result = ""; + for ( QStringList::Iterator itt = transformed.begin(), iti = intermediate.begin(); itt != transformed.end() && iti != intermediate.end(); ++itt, ++iti ) + { + result.append( *itt ); + + ++iti; + if ( iti == intermediate.end() ) + break; + + result.append( "$" ).append( *iti ).append( "$" ); + } + + qApp->processEvents(); + + /** \url accepts unquotet & and _ + May introduce new problem tough */ + if ( result.contains( "\\url{" ) ) + result.replace( "\\&", "&" ).replace( "\\_", "_" ).replace( QChar( 0x2013 ), "--" ).replace( "\\#", "#" ); + + decomposedUTF8toLaTeX( result ); + + /** Reinserting original URLs as explained above */ + pos = 0; + int idx = 0; + while ( pos >= 0 ) + { + pos = httpRegExp.search( result, pos ); + if ( pos >= 0 ) + { + ++pos; + int len = httpRegExp.cap( 0 ).length(); + result = result.left( pos - 1 ).append( urls[idx++] ).append( result.mid( pos + len - 1 ) ); + } + } + + return result.replace( startStopMarker,"" ); + } + + QString EncoderLaTeX::encode( const QString &text, const QChar &replace ) + { + QString result = text; + for ( QValueList<CharMappingItem>::ConstIterator it = m_charMapping.begin(); it != m_charMapping.end(); ++it ) + if (( *it ).unicode == replace ) + result.replace(( *it ).unicode, ( *it ).latex ); + return result; + } + + QString EncoderLaTeX::encodeSpecialized( const QString & text, const EntryField::FieldType fieldType ) + { + QString result = encode( text ); + + switch ( fieldType ) + { + case EntryField::ftPages: + result.replace( QChar( 0x2013 ), "--" ); + break; + + case EntryField::ftURL: + result.replace( "\\&", "&" ).replace( "\\_", "_" ).replace( QChar( 0x2013 ), "--" ).replace( "\\#", "#" ); + break; + + default: + break; + } + + return result; + } + + QString& EncoderLaTeX::decomposedUTF8toLaTeX( QString &text ) + { + for ( QValueList<CombinedMappingItem>::Iterator it = m_combinedMapping.begin(); it != m_combinedMapping.end(); ++it ) + { + int i = ( *it ).regExp.search( text ); + while ( i >= 0 ) + { + QString a = ( *it ).regExp.cap( 1 ); + text = text.left( i ) + "\\" + ( *it ).latex + "{" + a + "}" + text.mid( i + 2 ); + i = ( *it ).regExp.search( text, i + 1 ); + } + } + + return text; + } + + void EncoderLaTeX::buildCombinedMapping() + { + for ( int i = 0; i < decompositionscount; i++ ) + { + CombinedMappingItem item; + item.regExp = QRegExp( "(.)" + QString( QChar( decompositions[i].unicode ) ) ); + item.latex = decompositions[i].latexCommand; + m_combinedMapping.append( item ); + } + } + + void EncoderLaTeX::buildCharMapping() + { + /** encoding and decoding for digraphs such as -- or ?` */ + for ( int i = 0; i < charmappingdatalatexcount; i++ ) + { + CharMappingItem charMappingItem; + charMappingItem.regExp = QRegExp( charmappingdatalatex[ i ].regexp ); + charMappingItem.unicode = QChar( charmappingdatalatex[ i ].unicode ); + charMappingItem.latex = QString( charmappingdatalatex[ i ].latex ); + m_charMapping.append( charMappingItem ); + } + + /** encoding and decoding for commands such as \AA or \ss */ + for ( int i = 0; i < commandmappingdatalatexcount; ++i ) + { + /** different types of writing such as {\AA} or \AA{} possible */ + for ( int j = 0; j < expansionscmdcount; ++j ) + { + CharMappingItem charMappingItem; + charMappingItem.regExp = QRegExp( QString( expansionsCmd[j] ).arg( commandmappingdatalatex[i].letters ) ); + charMappingItem.unicode = QChar( commandmappingdatalatex[i].unicode ); + if ( charMappingItem.regExp.numCaptures() > 0 ) + charMappingItem.unicode += QString( "\\1" ); + charMappingItem.latex = QString( "{\\%1}" ).arg( commandmappingdatalatex[i].letters ); + m_charMapping.append( charMappingItem ); + } + } + + /** encoding and decoding for letters such as \"a */ + for ( int i = 0; i < modcharmappingdatalatexcount; ++i ) + { + QString modifierRegExp = QString( modcharmappingdatalatex[i].modifier ); + QString modifier = modifierRegExp; + modifier.replace( "\\^", "^" ).replace( "\\\\", "\\" ); + + /** first batch of replacement rules, where no separator is required between modifier and character (e.g. \"a) */ + if ( !modifierRegExp.at( modifierRegExp.length() - 1 ).isLetter() ) + for ( int j = 0; j < expansionsmod2count; ++j ) + { + CharMappingItem charMappingItem; + charMappingItem.regExp = QRegExp( QString( expansionsMod2[j] ).arg( modifierRegExp ).arg( modcharmappingdatalatex[i].letter ) ); + charMappingItem.unicode = QChar( modcharmappingdatalatex[i].unicode ); + charMappingItem.latex = QString( "{%1%2}" ).arg( modifier ).arg( modcharmappingdatalatex[i].letter ); + m_charMapping.append( charMappingItem ); + } + + /** second batch of replacement rules, where a separator is required between modifier and character (e.g. \v{g}) */ + for ( int j = 0; j < expansionsmod1count; ++j ) + { + CharMappingItem charMappingItem; + charMappingItem.regExp = QRegExp( QString( expansionsMod1[j] ).arg( modifierRegExp ).arg( modcharmappingdatalatex[i].letter ) ); + charMappingItem.unicode = QChar( modcharmappingdatalatex[i].unicode ); + charMappingItem.latex = QString( "%1{%2}" ).arg( modifier ).arg( modcharmappingdatalatex[i].letter ); + m_charMapping.append( charMappingItem ); + } + } + } + + EncoderLaTeX* EncoderLaTeX::currentEncoderLaTeX() + { + if ( encoderLaTeX == NULL ) + encoderLaTeX = new EncoderLaTeX(); + + return encoderLaTeX; + } + + void EncoderLaTeX::deleteCurrentEncoderLaTeX() + { + if ( encoderLaTeX != NULL ) + { + delete encoderLaTeX; + encoderLaTeX = NULL; + } + } + + char EncoderLaTeX::unicodeToASCII( unsigned int unicode ) + { + if ( unicode < 128 ) return ( char )unicode; + for ( int i = 0; i < modcharmappingdatalatexcount; ++i ) + if ( modcharmappingdatalatex[i].unicode == unicode ) + return *modcharmappingdatalatex[i].letter; + return '?'; + } + +} |