diff options
Diffstat (limited to 'kpdf/xpdf/xpdf/Lexer.cc')
-rw-r--r-- | kpdf/xpdf/xpdf/Lexer.cc | 521 |
1 files changed, 521 insertions, 0 deletions
diff --git a/kpdf/xpdf/xpdf/Lexer.cc b/kpdf/xpdf/xpdf/Lexer.cc new file mode 100644 index 00000000..1ef37175 --- /dev/null +++ b/kpdf/xpdf/xpdf/Lexer.cc @@ -0,0 +1,521 @@ +//======================================================================== +// +// Lexer.cc +// +// Copyright 1996-2003 Glyph & Cog, LLC +// +//======================================================================== + +#include <aconf.h> + +#ifdef USE_GCC_PRAGMAS +#pragma implementation +#endif + +#include <stdlib.h> +#include <stddef.h> +#include <string.h> +#include <ctype.h> +#include "Lexer.h" +#include "Error.h" +#include "XRef.h" + +//------------------------------------------------------------------------ + +// A '1' in this array means the character is white space. A '1' or +// '2' means the character ends a name or command. +static char specialChars[256] = { + 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, // 2x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, // 3x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, // 5x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 6x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // fx +}; + +//------------------------------------------------------------------------ +// Lexer +//------------------------------------------------------------------------ + +Lexer::Lexer(XRef *xrefA, Stream *str) { + Object obj; + + xref = xrefA; + + curStr.initStream(str); + streams = new Array(xref); + streams->add(curStr.copy(&obj)); + strPtr = 0; + freeArray = gTrue; + curStr.streamReset(); +} + +Lexer::Lexer(XRef *xrefA, Object *obj) { + Object obj2; + + xref = xrefA; + + if (obj->isStream()) { + streams = new Array(xref); + freeArray = gTrue; + streams->add(obj->copy(&obj2)); + } else { + streams = obj->getArray(); + freeArray = gFalse; + } + strPtr = 0; + if (streams->getLength() > 0) { + streams->get(strPtr, &curStr); + curStr.streamReset(); + } +} + +Lexer::~Lexer() { + if (!curStr.isNone()) { + curStr.streamClose(); + curStr.free(); + } + if (freeArray) { + delete streams; + } +} + +int Lexer::getChar() { + int c; + + c = EOF; + while (!curStr.isNone() && (c = curStr.streamGetChar()) == EOF) { + curStr.streamClose(); + curStr.free(); + ++strPtr; + if (strPtr < streams->getLength()) { + streams->get(strPtr, &curStr); + curStr.streamReset(); + } + } + return c; +} + +int Lexer::lookChar() { + if (curStr.isNone()) { + return EOF; + } + return curStr.streamLookChar(); +} + +Object *Lexer::getObj(Object *obj, int objNum) { + char *p; + int c, c2; + GBool comment, neg, done; + int numParen; + int xi; + double xf, scale; + GString *s; + int n, m; + + // skip whitespace and comments + comment = gFalse; + while (1) { + if ((c = getChar()) == EOF) { + return obj->initEOF(); + } + if (comment) { + if (c == '\r' || c == '\n') + comment = gFalse; + } else if (c == '%') { + comment = gTrue; + } else if (specialChars[c] != 1) { + break; + } + } + + // start reading token + switch (c) { + + // number + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + case '-': case '.': + neg = gFalse; + xi = 0; + if (c == '-') { + neg = gTrue; + } else if (c == '.') { + goto doReal; + } else { + xi = c - '0'; + } + while (1) { + c = lookChar(); + if (isdigit(c)) { + getChar(); + xi = xi * 10 + (c - '0'); + } else if (c == '.') { + getChar(); + goto doReal; + } else { + break; + } + } + if (neg) + xi = -xi; + obj->initInt(xi); + break; + doReal: + xf = xi; + scale = 0.1; + while (1) { + c = lookChar(); + if (c == '-') { + // ignore minus signs in the middle of numbers to match + // Adobe's behavior + error(getPos(), "Badly formatted number"); + getChar(); + continue; + } + if (!isdigit(c)) { + break; + } + getChar(); + xf = xf + scale * (c - '0'); + scale *= 0.1; + } + if (neg) + xf = -xf; + obj->initReal(xf); + break; + + // string + case '(': + p = tokBuf; + n = 0; + numParen = 1; + done = gFalse; + s = NULL; + do { + c2 = EOF; + switch (c = getChar()) { + + case EOF: +#if 0 + // This breaks some PDF files, e.g., ones from Photoshop. + case '\r': + case '\n': +#endif + error(getPos(), "Unterminated string"); + done = gTrue; + break; + + case '(': + ++numParen; + c2 = c; + break; + + case ')': + if (--numParen == 0) { + done = gTrue; + } else { + c2 = c; + } + break; + + case '\\': + switch (c = getChar()) { + case 'n': + c2 = '\n'; + break; + case 'r': + c2 = '\r'; + break; + case 't': + c2 = '\t'; + break; + case 'b': + c2 = '\b'; + break; + case 'f': + c2 = '\f'; + break; + case '\\': + case '(': + case ')': + c2 = c; + break; + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + c2 = c - '0'; + c = lookChar(); + if (c >= '0' && c <= '7') { + getChar(); + c2 = (c2 << 3) + (c - '0'); + c = lookChar(); + if (c >= '0' && c <= '7') { + getChar(); + c2 = (c2 << 3) + (c - '0'); + } + } + break; + case '\r': + c = lookChar(); + if (c == '\n') { + getChar(); + } + break; + case '\n': + break; + case EOF: + error(getPos(), "Unterminated string"); + done = gTrue; + break; + default: + c2 = c; + break; + } + break; + + default: + c2 = c; + break; + } + + if (c2 != EOF) { + if (n == tokBufSize) { + if (!s) + s = new GString(tokBuf, tokBufSize); + else + s->append(tokBuf, tokBufSize); + p = tokBuf; + n = 0; + + // we are growing see if the document is not malformed and we are growing too much + if (objNum > 0 && xref != NULL) + { + int newObjNum = xref->getNumEntry(curStr.streamGetPos()); + if (newObjNum != objNum) + { + error(getPos(), "Unterminated string"); + done = gTrue; + delete s; + n = -2; + } + } + } + *p++ = (char)c2; + ++n; + } + } while (!done); + if (n >= 0) { + if (!s) + s = new GString(tokBuf, n); + else + s->append(tokBuf, n); + obj->initString(s); + } else { + obj->initEOF(); + } + break; + + // name + case '/': + p = tokBuf; + n = 0; + s = NULL; + while ((c = lookChar()) != EOF && !specialChars[c]) { + getChar(); + if (c == '#') { + c2 = lookChar(); + if (c2 >= '0' && c2 <= '9') { + c = c2 - '0'; + } else if (c2 >= 'A' && c2 <= 'F') { + c = c2 - 'A' + 10; + } else if (c2 >= 'a' && c2 <= 'f') { + c = c2 - 'a' + 10; + } else { + goto notEscChar; + } + getChar(); + c <<= 4; + c2 = getChar(); + if (c2 >= '0' && c2 <= '9') { + c += c2 - '0'; + } else if (c2 >= 'A' && c2 <= 'F') { + c += c2 - 'A' + 10; + } else if (c2 >= 'a' && c2 <= 'f') { + c += c2 - 'a' + 10; + } else { + error(getPos(), "Illegal digit in hex char in name"); + } + } + notEscChar: + if (n == tokBufSize) { + if (!s) + s = new GString(tokBuf, tokBufSize); + else + { + // the spec says 127 is the maximum, we are already at 256 so bail out + error(getPos(), "Name token too long"); + break; + } + p = tokBuf; + n = 0; + } + *p++ = c; + ++n; + } + *p = '\0'; + if (s) { + s->append(tokBuf, n); + obj->initName(s->getCString()); + delete s; + } else obj->initName(tokBuf); + break; + + // array punctuation + case '[': + case ']': + tokBuf[0] = c; + tokBuf[1] = '\0'; + obj->initCmd(tokBuf); + break; + + // hex string or dict punctuation + case '<': + c = lookChar(); + + // dict punctuation + if (c == '<') { + getChar(); + tokBuf[0] = tokBuf[1] = '<'; + tokBuf[2] = '\0'; + obj->initCmd(tokBuf); + + // hex string + } else { + p = tokBuf; + m = n = 0; + c2 = 0; + s = NULL; + while (1) { + c = getChar(); + if (c == '>') { + break; + } else if (c == EOF) { + error(getPos(), "Unterminated hex string"); + break; + } else if (specialChars[c] != 1) { + c2 = c2 << 4; + if (c >= '0' && c <= '9') + c2 += c - '0'; + else if (c >= 'A' && c <= 'F') + c2 += c - 'A' + 10; + else if (c >= 'a' && c <= 'f') + c2 += c - 'a' + 10; + else + error(getPos(), "Illegal character <%02x> in hex string", c); + if (++m == 2) { + if (n == tokBufSize) { + if (!s) + s = new GString(tokBuf, tokBufSize); + else + s->append(tokBuf, tokBufSize); + p = tokBuf; + n = 0; + } + *p++ = (char)c2; + ++n; + c2 = 0; + m = 0; + } + } + } + if (!s) + s = new GString(tokBuf, n); + else + s->append(tokBuf, n); + if (m == 1) + s->append((char)(c2 << 4)); + obj->initString(s); + } + break; + + // dict punctuation + case '>': + c = lookChar(); + if (c == '>') { + getChar(); + tokBuf[0] = tokBuf[1] = '>'; + tokBuf[2] = '\0'; + obj->initCmd(tokBuf); + } else { + error(getPos(), "Illegal character '>'"); + obj->initError(); + } + break; + + // error + case ')': + case '{': + case '}': + error(getPos(), "Illegal character '%c'", c); + obj->initError(); + break; + + // command + default: + p = tokBuf; + *p++ = c; + n = 1; + while ((c = lookChar()) != EOF && !specialChars[c]) { + getChar(); + if (++n == tokBufSize) { + error(getPos(), "Command token too long"); + break; + } + *p++ = c; + } + *p = '\0'; + if (tokBuf[0] == 't' && !strcmp(tokBuf, "true")) { + obj->initBool(gTrue); + } else if (tokBuf[0] == 'f' && !strcmp(tokBuf, "false")) { + obj->initBool(gFalse); + } else if (tokBuf[0] == 'n' && !strcmp(tokBuf, "null")) { + obj->initNull(); + } else { + obj->initCmd(tokBuf); + } + break; + } + + return obj; +} + +void Lexer::skipToNextLine() { + int c; + + while (1) { + c = getChar(); + if (c == EOF || c == '\n') { + return; + } + if (c == '\r') { + if ((c = lookChar()) == '\n') { + getChar(); + } + return; + } + } +} + +GBool Lexer::isSpace(int c) { + return c >= 0 && c <= 0xff && specialChars[c] == 1; +} |