diff options
author | toma <toma@283d02a7-25f6-0310-bc7c-ecb5cbfe19da> | 2009-11-25 17:56:58 +0000 |
---|---|---|
committer | toma <toma@283d02a7-25f6-0310-bc7c-ecb5cbfe19da> | 2009-11-25 17:56:58 +0000 |
commit | bd9e6617827818fd043452c08c606f07b78014a0 (patch) | |
tree | 425bb4c3168f9c02f10150f235d2cb998dcc6108 /poxml/parser.cpp | |
download | tdesdk-bd9e6617827818fd043452c08c606f07b78014a0.tar.gz tdesdk-bd9e6617827818fd043452c08c606f07b78014a0.zip |
Copy the KDE 3.5 branch to branches/trinity for new KDE 3.5 features.
BUG:215923
git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdesdk@1054174 283d02a7-25f6-0310-bc7c-ecb5cbfe19da
Diffstat (limited to 'poxml/parser.cpp')
-rw-r--r-- | poxml/parser.cpp | 1008 |
1 files changed, 1008 insertions, 0 deletions
diff --git a/poxml/parser.cpp b/poxml/parser.cpp new file mode 100644 index 00000000..c34976bf --- /dev/null +++ b/poxml/parser.cpp @@ -0,0 +1,1008 @@ +// #define POXML_DEBUG + +#include "parser.h" +#include <iostream> +#include <stdlib.h> +#include <assert.h> +#include <qregexp.h> + +using namespace std; + +static const char *singletags[] = {"beginpage","imagedata", "colspec", "spanspec", + "anchor", "xref", "area", + "footnoteref", "void", "inlinegraphic", + "glosssee", "graphic", "xi:include", + 0}; +static const char *cuttingtags[] = {"bridgehead", "trans_comment", "para", "title", "term", + "entry", "contrib", "keyword", "example", + "note", "footnote", "caution", + "informalexample", "remark", "comment", + "imageobject", "varlistentry", "thead", + "tbody", "tgroup", "row", "screenshot", "screeninfo", + "variablelist", "step", "procedure", + "step", "holder", "listitem", "important", + "author", "itemizedlist", "orderedlist", + "caption", "textobject", "mediaobject", + "tip", "glossdef", "inlinemediaobject", + "simplelist", "member", "glossentry", + "areaspec", "corpauthor", "indexterm", + "calloutlist", "callout", "subtitle", + "table", "part", "xi:fallback", "primary", + "secondary", "chapter", "sect1", "sect2", + "figure", "abstract", "sect3", "sect", "sect4", + "warning", "preface", "authorgroup", "keywordset", + "informaltable", "qandaentry", "question", "answer", + "othercredit", "affiliation", "qandaset", + "cmdsynopsis", "funcsynopsis", "funcsynopsisinfo" , + "epigraph", "attribution", "glossary", "chapterinfo", + "glossdiv", "blockingquote", "simplesect", "section", + "qandadiv", "refsect1", "refmeta", "formalpara", + "refentry", "refnamediv", "refpurpose", "refentrytitle", + "refmiscinfo", "refsect2", "refsect3", "refsect1info", + "refsect2info", "refsect3info", "refsection", "refsectioninfo", + "refsynopsisdiv", "refsysnopsisdivinfo", "remark", + "revdescription", "glossentry", "partinfo", + "segmentedlist", "segtitle", "seg", "seglistitem", "screenco", + 0}; +static const char *literaltags[] = {"literallayout", "synopsis", "screen", + "programlisting", 0}; + +bool StructureParser::fatalError ( const QXmlParseException &e ) +{ + cerr << "fatalError " << e.message().latin1() << " " << e.lineNumber() << " " + << e.columnNumber() << endl; + return false; +} + +bool StructureParser::startDocument() +{ + infos_reg = QRegExp("\\s*poxml_line=\"(\\d+)\" poxml_col=\"(\\d+)\""); + do_not_split_reg = QRegExp("\\s*condition=\"do-not-split\""); + message = ""; + inside = 0; + return true; +} + +bool StructureParser::isCuttingTag(const QString &qName) +{ + int index = 0; + while (cuttingtags[index]) { + if (cuttingtags[index] == qName) + return true; + index++; + } + return isLiteralTag(qName); +} + +bool StructureParser::isSingleTag(const QString &qName) +{ + int index = 0; + while (singletags[index]) { + if (singletags[index] == qName) + return true; + index++; + } + return false; +} + +bool StructureParser::isLiteralTag(const QString &qName) +{ + int index = 0; + while (literaltags[index]) { + if (literaltags[index] == qName) + return true; + index++; + } + return false; +} + +bool StructureParser::skippedEntity ( const QString & name ) +{ + if (inside) + message += QString("&%1;").arg(name); + return true; +} + +bool StructureParser::startElement( const QString& , const QString& , + const QString& qName, + const QXmlAttributes & attr ) +{ + QString tname = qName.lower(); + + bool first = false; + + if (isCuttingTag(tname)) { + if (!inside) { + message = QString::null; + list.pc.increasePara(); + startline = locator->lineNumber(); + startcol = locator->columnNumber(); + first = true; + } + inside++; + } + + if (inside) + { + QString tmp = "<" + tname; + for (int i = 0; i < attr.length(); i++) { + tmp += QString(" %1=\"%2\"").arg(attr.qName(i)).arg(attr.value(i)); + } + tmp += QString(" poxml_line=\"%1\"").arg(locator->lineNumber()); + tmp += QString(" poxml_col=\"%1\"").arg(locator->columnNumber()); + + if (isSingleTag(qName)) + tmp += "/>"; + else + tmp += ">"; + message += tmp; + if (first) + startcol -= message.length(); + } + + if (tname == "anchor" || tname.left(4) == "sect" || tname == "chapter") + if (!attr.value("id").isEmpty()) list.pc.addAnchor(attr.value("id")); + + return true; +} + +bool StructureParser::startCDATA() +{ + if ( inside ) + message += "<![CDATA["; + return true; +} + +bool StructureParser::endCDATA() +{ + if ( inside ) + message += "]]>"; + return true; +} + +bool StructureParser::isClosure(const QString &message) +{ + assert(message.at(0) == '<'); + int endindex = 1; + while (!message.at(endindex).isSpace() && message.at(endindex) != '>') + endindex++; + QString tag = message.mid(1, endindex - 1); + return closureTag(message, tag); +} + +bool StructureParser::closureTag(const QString& message, const QString &tag) +{ +#ifdef POXML_DEBUG + qDebug("closureTag %s %s", message.latin1(), tag.latin1()); +#endif + + int inside = 0; + uint index = 0; + while (true) + { + int nextclose = message.find(QRegExp(QString::fromLatin1("</%1[\\s>]").arg(tag)), index); + int nextstart = message.find(QRegExp(QString::fromLatin1("<%1[>\\s]").arg(tag)), index); + // qDebug("finding %d %d %d %d", nextstart, nextclose, index, inside); + if (nextclose == -1) { +#ifdef POXML_DEBUG + qDebug("ending on no close anymore %d %d %d %d", (!inside && index >= message.length()), inside, index, message.length()); +#endif + return !inside && index >= message.length(); + } + if (nextstart == -1) + nextstart = message.length() + 1; + + if (nextstart < nextclose) { + inside++; + index = nextstart + 1; + while (message.at(index) != '>') + index++; + index++; + } else { + inside--; + index = nextclose + 1; + while (message.at(index) != '>') + index++; + index++; + if (!inside) { +#ifdef POXML_DEBUG + qDebug("ending on exit %d", index >= message.length()); +#endif + return index >= message.length(); + } + } + } +} + +void StructureParser::descape(QString &message) +{ + uint index = 0; + stripWhiteSpace( message ); + + int inside = 0; + bool lastws = false; + + while (index < message.length()) { + switch (message.at(index).latin1()) { + case '\n': + case '\t': + case '\r': + if (!inside) + message[index] = ' '; + case ' ': + if (!inside && lastws) + message[index] = '\010'; + lastws = true; + break; + case '<': { + uint endindex = index+1; + while (endindex < message.length() && !message.at(endindex).isSpace() && + message.at(endindex) != '>') + endindex++; + QString tag = message.mid(index + 1, endindex - index - 1); + if (tag.at(0) == '/') { + if (isLiteralTag(tag.mid(1))) + inside--; + } else + if (isLiteralTag(tag)) + inside++; + break; + } + default: + lastws = false; + } + + index++; + } + message.replace(QRegExp("\010"), ""); +} + +bool StructureParser::formatMessage(MsgBlock &msg) const +{ +#ifdef POXML_DEBUG + qDebug("formatMessage %s", msg.msgid.latin1()); +#endif + + int offset = 0; + bool changed = false; + bool recurse = true; + + if (msg.msgid.isEmpty()) + return true; + + for (int index = 0; msg.msgid.at(index) == ' '; index++, offset++); + stripWhiteSpace( msg.msgid ); + + // removing starting single tags + for (int index = 0; singletags[index]; index++) + { + int slen = strlen(singletags[index]); + + if (msg.msgid.left(slen + 1) == QString::fromLatin1("<%1").arg(singletags[index]) && + !msg.msgid.at( slen + 1 ).isLetterOrNumber() ) + { +#ifdef POXML_DEBUG + qDebug("removing single tag %s", singletags[index]); +#endif + int strindex = strlen(singletags[index]) + 1; + while (msg.msgid.at(strindex) != '>') + strindex++; + msg.msgid = msg.msgid.mid(strindex + 1); + changed = true; + offset += strindex + 1; + for (int index = 0; msg.msgid.at(index) == ' '; index++, offset++) ; + stripWhiteSpace( msg.msgid ); + } + } + + while (msg.msgid.right(2) == "/>") + { + int strindex = msg.msgid.length() - 2; + while (msg.msgid.at(strindex) != '<') + strindex--; + msg.msgid = msg.msgid.left(strindex); + stripWhiteSpace( msg.msgid ); // only removed space at the end + changed = true; + } + + for (int index = 0; msg.msgid.at(index) == ' '; index++, offset++) ; + stripWhiteSpace( msg.msgid ); + + while (true) { + if (msg.msgid.at(0) != '<') + break; + if (msg.msgid.at(msg.msgid.length() - 1) != '>') + break; + int strindex = 1; + while (msg.msgid.at(strindex) != ' ' && msg.msgid.at(strindex) != '>') + strindex++; + QString starttag = msg.msgid.mid(1, strindex - 1); + int endindex = msg.msgid.length() - 2; + while (msg.msgid.at(endindex) != '<' && msg.msgid.at(endindex + 1) != '/') + endindex--; +#ifdef POXML_DEBUG + qDebug("endIndex %d", endindex); +#endif + strindex = endindex; + QString orig = msg.msgid; + + QString endtag = msg.msgid.mid(endindex + 2, msg.msgid.length() - (endindex + 2) - 1); + QString endtag_attr = endtag.mid(endtag.find(' '), endtag.length()); + endtag.replace(infos_reg, ""); + if (endtag == starttag) { + if (!closureTag(msg.msgid, starttag)) + break; + + // removing start/end tags + msg.msgid = msg.msgid.left(endindex); + strindex = 0; + while (msg.msgid.at(strindex) != '>') + strindex++; + QString attr = msg.msgid.left(strindex); + msg.msgid = msg.msgid.mid(strindex + 1); + offset += strindex + 1; + for (int index = 0; msg.msgid.at(index) == ' '; index++, offset++) ; + stripWhiteSpace( msg.msgid ); + msg.tag = starttag; + + if (infos_reg.search(attr) >= 0) { + msg.lines.first().start_line = infos_reg.cap(1).toInt(); + msg.lines.first().start_col = infos_reg.cap(2).toInt(); +#ifdef POXML_DEBUG + qDebug("col %s %s %d", attr.latin1(), msg.msgid.latin1(), msg.lines.first().start_col); +#endif + offset = 0; + + if (infos_reg.search(endtag_attr) >= 0) { + msg.lines.first().end_line = infos_reg.cap(1).toInt(); + msg.lines.first().end_col = infos_reg.cap(2).toInt() + 1; + } + } + if (do_not_split_reg.search(attr) >= 0) { + msg.do_not_split = true; + break; + } + + changed = true; + } else + break; + } + +#ifdef POXML_DEBUG + qDebug("formatMessage result %s %d %d", msg.msgid.latin1(), changed && recurse, msg.lines.first().start_col); +#endif + + msg.lines.first().offset += offset; + if (msg.do_not_split) + recurse = false; + + if (changed && recurse) + formatMessage(msg); + + return !recurse; // indicates an abort +} + +MsgList StructureParser::splitMessage(const MsgBlock &mb) +{ + MsgList result; + + MsgBlock msg1 = mb; + MsgBlock msg2 = mb; + + QString message = mb.msgid; + +#ifdef POXML_DEBUG + qDebug("splitMessage %s", message.latin1()); +#endif + + if (message.at(0) == '<') { + int endindex = 1; + while (!message.at(endindex).isSpace() && message.at(endindex) != '>') + endindex++; + QString tag = message.mid(1, endindex - 1); + + if (closureTag(message, tag)) + goto error; + + if (isCuttingTag(tag)) + { + // if the message starts with a cutting tag, this tag has to + // end in between. We split both messages and format them + int strindex = endindex; + strindex++; + + int inside = 1; + while (true) { +#ifdef POXML_DEBUG + qDebug("inside %s %d", message.mid(strindex, 35).latin1(), inside); +#endif + + // the exception for poxml_* attributes is made in the closing tag + int closing_index = message.find(QRegExp(QString::fromLatin1("</%1[\\s>]").arg(tag)), + strindex); + int starting_index = message.find(QRegExp(QString::fromLatin1("<%1[\\s>]").arg(tag)), + strindex); + +#ifdef POXML_DEBUG + qDebug("index1 %d %d %d", closing_index, starting_index, strindex); +#endif + + // when a new start was found, we set the start_index after the next match + // (and set strindex to it later - increasing inside) + if (starting_index != -1) { + starting_index += tag.length() + 1; + while (message.at(starting_index) != '>') + starting_index++; + starting_index++; + } + +#ifdef POXML_DEBUG + qDebug("index %d %d %d", closing_index, starting_index, strindex); +#endif + + assert(closing_index != -1); + closing_index += 3 + tag.length(); + while (message.at(closing_index - 1) != '>') + closing_index++; + + if (starting_index == -1) { + strindex = closing_index; +#ifdef POXML_DEBUG + qDebug("set strindex %d", strindex); +#endif + inside--; + if (!inside) + break; + continue; + } + if (closing_index < starting_index) + { + strindex = closing_index; + inside--; + } else { + strindex = starting_index; + inside++; + } + + if (!inside) + break; + } + +#ifdef POXML_DEBUG + qDebug("split into %s -AAAAAANNNNNNDDDDDD- %s", message.left(strindex).latin1(), message.mid(strindex).latin1()); +#endif + msg1.msgid = message.left(strindex); + bool leave = formatMessage(msg1); + + msg2.msgid = message.mid(strindex); + msg2.lines.first().offset += strindex; + leave = leave & formatMessage(msg2); + + if (msg1.lines.first().end_line > msg2.lines.first().start_line || + (msg1.lines.first().end_line == msg2.lines.first().start_line && + msg1.lines.first().end_col > msg2.lines.first().start_col)) + { + msg2.lines.first().start_line = msg1.lines.first().end_line; + msg2.lines.first().start_col = msg1.lines.first().end_col; + } + +#ifdef POXML_DEBUG + qDebug("splited %d-%d(%s) and %d-%d(%s)", msg1.lines.first().end_line,msg1.lines.first().end_col, + msg1.msgid.latin1(), + msg2.lines.first().start_line,msg2.lines.first().start_col, msg2.msgid.latin1()); +#endif + + if (leave) { + result.append(msg1); + result.append(msg2); + return result; + } + result = splitMessage(msg1); + result += splitMessage(msg2); + return result; + } + + } + + if (message.at(message.length() - 1 ) == '>') + { + int endindex = message.length() - 1; + while (endindex >= 0 && (message.at(endindex) != '<' || message.at(endindex + 1) != '/')) + endindex--; + QString tag = message.mid(endindex + 2, message.length() - endindex - 3); + if (tag.find(' ') > 0 ) { + tag = tag.left(tag.find(' ')); + } +#ifdef POXML_DEBUG + qDebug("behind tag %s", tag.latin1()); +#endif + + if (isCuttingTag(tag)) + { + // if the message ends with a cutting tag, this tag has to + // start in between. We split both messages and format them + int strindex = endindex; + + int inside = 1; + while (true) { +#ifdef POXML_DEBUG + qDebug("inside %s %d", message.mid(strindex, 35).latin1(), inside); +#endif + + int closing_index = message.findRev(QRegExp(QString::fromLatin1("</%1[\\s>]").arg(tag)), + strindex - 1); + int starting_index = message.findRev(QRegExp(QString::fromLatin1("<%1[\\s>]").arg(tag)), + strindex - 1); + +#ifdef POXML_DEBUG + qDebug("index1 %d %d %d", closing_index, starting_index, strindex); +#endif + + if (starting_index == -1) { + assert(inside == 1); + break; + } + + if (closing_index > starting_index) + { + strindex = closing_index; + inside++; + } else { + strindex = starting_index; + inside--; + } + + if (!inside) + break; + } + + +#ifdef POXML_DEBUG + qDebug("split2 into \"%s\" -AAAAAANNNNNNNNNDDDDDDDDDDD- \"%s\"", message.left(strindex).latin1(), message.mid(strindex).latin1()); +#endif + + msg1.msgid = message.left(strindex); + formatMessage(msg1); + + msg2.msgid = message.mid(strindex); + msg2.lines.first().offset += strindex; + formatMessage(msg2); + + if (msg1.lines.first().end_line > msg2.lines.first().start_line || + (msg1.lines.first().end_line == msg2.lines.first().start_line && + msg1.lines.first().end_col > msg2.lines.first().start_col)) + { + msg1.lines.first().end_line = msg2.lines.first().start_line; + msg1.lines.first().end_col = msg2.lines.first().start_col - 1; + } + +#ifdef POXML_DEBUG + qDebug("splited %d-%d(%s) and %d-%d(%s)", msg1.lines.first().end_line,msg1.lines.first().end_col, + msg1.msgid.latin1(), + msg2.lines.first().start_line,msg2.lines.first().start_col, msg2.msgid.latin1()); +#endif + + result = splitMessage(msg1); + result += splitMessage(msg2); + + return result; + } + } +error: + result.append(mb); + return result; +} + +bool StructureParser::endElement( const QString& , const QString&, const QString& qName) +{ + QString tname = qName.lower(); + + // qDebug("endElement %s - %s %d", tname.latin1(), message.latin1(), inside); + + if (inside) { + if (!isSingleTag(qName)) { + message += QString("</%1").arg(tname); + message += QString(" poxml_line=\"%1\"").arg(locator->lineNumber()); + message += QString(" poxml_col=\"%1\"").arg(locator->columnNumber()); + message += ">"; + } + } + + if (isCuttingTag(tname)) { + inside--; + if (!inside) { + MsgBlock m; + descape(message); + m.msgid = message; + + BlockInfo bi; + bi.start_line = startline; + bi.start_col = startcol; + bi.end_line = locator->lineNumber(); + bi.end_col = locator->columnNumber() + 1; + bi.offset = m.lines.first().offset; + m.lines.append(bi); + formatMessage(m); + + MsgList messages = splitMessage(m); + for (MsgList::Iterator it = messages.begin(); + it != messages.end(); ++it) + { +#ifdef POXML_DEBUG + qDebug("parser '%s' %d '%s' %d:%d", (*it).msgid.latin1(), (*it).lines.first().offset, message.mid((*it).lines.first().offset, 15).latin1(), (*it).lines.first().start_line, (*it).lines.first().start_col); +#endif + // if the remaining text still starts with a tag, the poxml_ info + // is most probably more correct + if ((*it).msgid.at(0) == '<' && isClosure((*it).msgid)) { + if (infos_reg.search((*it).msgid) >= 0) { + (*it).lines.first().start_line = infos_reg.cap(1).toInt(); + (*it).lines.first().start_col = infos_reg.cap(2).toInt();; + (*it).lines.first().offset = 0; + } + } + (*it).msgid.replace(infos_reg, QString::null); + + if (!(*it).msgid.isEmpty()) + list.append(*it); + } + } + } + + return true; +} + +bool StructureParser::comment ( const QString &c ) +{ + if (c.left(7) != " TRANS:") + return true; + + assert(false); + return true; +} + +QString StructureParser::escapeLiterals( const QString &_contents) { + QString contents = _contents; + + contents.replace(QRegExp("\n"), "&POXML_LINEFEED;"); + contents.replace(QRegExp("<"), "&POXML_LT;"); + contents.replace(QRegExp(">"), "&POXML_GT;"); + contents.replace(QRegExp("\t"), " "); + contents.replace(QRegExp(" "), "&POXML_SPACE;"); + + return contents; +} + +QString StructureParser::descapeLiterals( const QString &_contents) { + QString contents = _contents; + + contents.replace(QRegExp("&POXML_LINEFEED;"), "\n"); + contents.replace(QRegExp("&POXML_LT;"), "<"); + contents.replace(QRegExp("&POXML_GT;"), ">"); + contents.replace(QRegExp("&POXML_SPACE;"), " "); + contents.replace(QRegExp("!POXML_AMP!"), "&"); + return contents; +} + +void StructureParser::stripWhiteSpace( QString &contents) +{ + contents = contents.stripWhiteSpace(); + bool changed; + do { + changed = false; + if (contents.startsWith("&POXML_LINEFEED;")) { + contents = contents.mid(strlen("&POXML_LINEFEED;"), contents.length()); + changed = true; + } + if (contents.startsWith("&POXML_SPACE;")) { + contents = contents.mid(strlen("&POXML_SPACE;"), contents.length()); + changed = true; + } + if (contents.endsWith("&POXML_LINEFEED;")) { + contents = contents.left(contents.length() - strlen("&POXML_LINEFEED;")); + changed = true; + } + if (contents.endsWith("&POXML_SPACE;")) { + contents = contents.left( contents.length() - strlen("&POXML_SPACE;")); + changed = true; + } + } while (changed); +} + +void StructureParser::cleanupTags( QString &contents ) +{ + contents.replace(QRegExp("&"), "!POXML_AMP!"); + + for (int index = 0; literaltags[index]; index++) { + QRegExp start(QString("<%1[\\s>]").arg(literaltags[index])); + QRegExp end(QString("</%1[\\s>]").arg(literaltags[index])); + int strindex = 0; + while (true) { + strindex = contents.find(start, strindex); + if (strindex < 0) + break; + while (contents.at(strindex) != '>') + strindex++; + strindex++; // one more + int endindex = contents.find(end, strindex); + QString part = contents.mid(strindex, endindex - strindex); + QString newpart = escapeLiterals(part); + contents.replace(strindex, part.length(), newpart); + // this assumes that literal tags to not overlap + strindex = strindex + newpart.length(); + } + } + + QRegExp unclosed("</(\\w*)\\s\\s*>"); + int index = -1; + while (true) { + index = unclosed.search(contents, index + 1); + if (index < 0) + break; + QString tag = unclosed.cap(1); + contents.replace(index, unclosed.matchedLength(), QString("</%1>").arg(tag)); + } + + QRegExp start("<((\\s*[^<>\\s])*)\\s\\s*(/*)>"); + start.setMinimal(true); + + index = -1; + while (true) { + index = start.search(contents, index + 1); + if (index < 0) + break; + QString tag = start.cap(1); + QString cut = start.capturedTexts().last(); + // qDebug("UNCLO %s %d -%s- -%s-", start.cap(0).latin1(), index, tag.latin1(), cut.latin1()); + contents.replace(index, start.matchedLength(), QString("<%1%2>").arg(tag).arg(cut)); + } + QRegExp singletag("<(\\w*)\\s([^><]*)/>"); + + index = -1; + while (true) { + index = singletag.search(contents, index + 1); + if (index < 0) + break; + QString tag = singletag.cap(1); + if (!StructureParser::isSingleTag(tag)) { + contents.replace(index, singletag.matchedLength(), QString("<%1 %2></%3>").arg(tag).arg(singletag.cap(2)).arg(tag)); + } + } + + QRegExp trans_comment("<!-- TRANS:([^<>]*)-->"); + index = -1; + while (true) { + index = trans_comment.search(contents, index + 1); + if (index < 0) + break; + QString msgid = trans_comment.cap(1); + contents.replace(index, trans_comment.matchedLength(), QString("<trans_comment>%1</trans_comment>").arg(msgid)); + } + +#ifdef POXML_DEBUG + qDebug("final %s", contents.latin1()); +#endif + +} + +static bool removeEmptyTag( QString &contents, const QString & tag) +{ +// qDebug("cont %s %s", contents.latin1(), tag.latin1()); + + QRegExp empty(QString("<%1[^>]*>[\\s\n][\\s\n]*</%2\\s*>").arg(tag).arg(tag)); + int strindex = 0; + while (true) { + strindex = contents.find(empty, strindex); + if (strindex < 0) + break; + qDebug("found empty tag %s", tag.latin1()); + contents.replace(strindex, empty.matchedLength(), " "); + strindex++; + return true; + } + return false; +} + +void StructureParser::removeEmptyTags( QString &contents ) +{ + bool removed; + do { + removed = false; + + for (int index = 0; cuttingtags[index]; index++) { + if (removeEmptyTag(contents, cuttingtags[index])) { + removed = true; + break; + } + } + // as glossterm has two different semantics, it's likely + // to break something when it's cuttingtag + if (removeEmptyTag(contents, "glossterm")) + removed = true; + + } while (removed); +} + +bool StructureParser::characters(const QString &ch) +{ + if (inside && !ch.isEmpty()) + message += ch; + return true; +} + +QString escape(QString message) +{ + message.replace(QRegExp("\\\\"), "\\\\"); + message.replace(QRegExp("\""), "\\\""); + return message; +} + +void outputMsg(const char *prefix, const QString &message) +{ + QStringList list = QStringList::split('\n', message, true); + QString line; + + if (list.count() == 1) { + line = list.first(); + if (line.isEmpty()) + cout << prefix << " \"\"\n"; + else + cout << prefix << " \"" << escape(line).utf8().data() << "\"\n"; + } else { + cout << prefix << " \"\"\n"; + for (QStringList::ConstIterator it = list.begin(); it != list.end(); it++) { + line = *it; + if (!line.isEmpty()) { + cout << " \"" << escape(line).utf8().data(); + if (it == list.fromLast()) + cout << "\"\n"; + else + cout << "\\n\"\n"; + } else { + cout << " \""; + if (it != list.fromLast()) + cout << "\\n"; + cout << "\"\n"; + } + } + } +} + +QString escapePO(QString msgid) +{ + int index = 0; + while (true) { + index = msgid.find("\\n", index); + if (index == -1) + break; + if (index >= 1 && msgid.at(index - 1) == '\\' && msgid.at(index - 2) != '\\') { + msgid.replace(index - 1, 3, "&POXML_LITERALLINEFEED;"); + index += 3; + } else + msgid.replace(index, 2, "\n"); + } + index = 0; + while (true) { + index = msgid.find("\\\"", index); + if (index == -1) + break; + if (index > 1 && msgid.at(index - 1) == '\\' && msgid.at(index - 2) != '\\') + msgid.replace(index - 1, 3, "&POXML_LITERALQUOTE;"); + else + msgid.replace(index, 2, "\""); + } + index = 0; + while (true) { + index = msgid.find("\\t", index); + if (index == -1) + break; + if (msgid.at(index - 1) == '\\') + msgid.replace(index - 1, 3, "\\t"); + else + msgid.replace(index, 2, "\t"); + } + index = 0; + while (true) { + index = msgid.find("\\\\", index); + if (index == -1) + break; + msgid.replace(index, 2, "\\"); + index += 1; + } + + msgid.replace(QRegExp("&POXML_LITERALLINEFEED;"), "\\n"); + msgid.replace(QRegExp("&POXML_LITERALQUOTE;"), "\\"); + return msgid; +} + + +MsgList parseXML(const char *filename) +{ + StructureParser handler; + QFile xmlFile( filename ); + xmlFile.open(IO_ReadOnly); + + QCString ccontents; + ccontents.fill(0, xmlFile.size() + 1); + memcpy(ccontents.data(), xmlFile.readAll().data(), xmlFile.size()); + xmlFile.close(); + + QString contents = QString::fromUtf8( ccontents ); + StructureParser::cleanupTags(contents); + + while (true) { + int index = contents.find("<!ENTITY"); + if (index < 0) + break; + int inside = 0; + int endindex = index + 1; + QString replacement = ""; + while (contents.at(endindex) != '>' || inside) + { + switch (contents.at(endindex).latin1()) { + case '<': + inside++; break; + case '>': + inside--; break; + case '\n': + replacement += '\n'; + break; + default: + break; + } + endindex++; + } + endindex++; + contents.replace(index, endindex - index, replacement); + } + + QTextStream ts(contents.utf8(), IO_ReadOnly); + QXmlInputSource source( ts ); + QXmlSimpleReader reader; + reader.setFeature( "http://trolltech.com/xml/features/report-start-end-entity", true); + reader.setContentHandler( &handler ); + reader.setLexicalHandler( &handler ); + reader.setDTDHandler( &handler ); + // reader.setErrorHandler( &handler ); + reader.parse( source ); + MsgList english = handler.getList(); + + bool changed = false; + + do { + changed = false; + QMap<QString, QString> msgids; + + for (MsgList::Iterator it = english.begin(); + it != english.end(); it++) + { + QMap<QString,QString>::Iterator found = msgids.find((*it).msgid); + if ((*it).msgid.length() < 4) { + (*it).msgid = QString("<%1>").arg((*it).tag) + (*it).msgid + + QString("</%1>").arg((*it).tag); + changed = true; + break; + } + if (found != msgids.end()) { + if (found.data() != (*it).tag) { +#ifdef POXML_DEBUG + qDebug("same msgid for '%s' and '%s'", found.data().latin1(), (*it).tag.latin1()); +#endif + changed = true; + QString msgid = (*it).msgid; + for (MsgList::Iterator it2 = english.begin(); + it2 != english.end(); it2++) + { + if ((*it2).msgid == msgid) + (*it2).msgid = QString("<%1>").arg((*it2).tag) + msgid + QString("</%1>").arg((*it2).tag); + } + break; + } + } else { + msgids.insert((*it).msgid, (*it).tag); + } + } + } while (changed); + + return english; +} + |