diff options
Diffstat (limited to 'debian/uncrustify-trinity/uncrustify-trinity-0.78.1/src/tokenize.cpp')
-rw-r--r-- | debian/uncrustify-trinity/uncrustify-trinity-0.78.1/src/tokenize.cpp | 2946 |
1 files changed, 2946 insertions, 0 deletions
diff --git a/debian/uncrustify-trinity/uncrustify-trinity-0.78.1/src/tokenize.cpp b/debian/uncrustify-trinity/uncrustify-trinity-0.78.1/src/tokenize.cpp new file mode 100644 index 00000000..846755fe --- /dev/null +++ b/debian/uncrustify-trinity/uncrustify-trinity-0.78.1/src/tokenize.cpp @@ -0,0 +1,2946 @@ +/** + * @file tokenize.cpp + * This file breaks up the text stream into tokens or chunks. + * + * Each routine needs to set pc.len and pc.type. + * + * @author Ben Gardner + * @license GPL v2+ + */ + +#include "tokenize.h" + +#include "keywords.h" +#include "prototypes.h" +#include "punctuators.h" +#include "unc_ctype.h" + +#include <regex> +#include <stack> + + +#define LE_COUNT(x) cpd.le_counts[static_cast<size_t>(LE_ ## x)] + +constexpr static auto LCURRENT = LTOK; + +using namespace std; +using namespace uncrustify; + + +struct TokenInfo +{ + TokenInfo() + : last_ch(0) + , idx(0) + , row(1) + , col(1) + { + } + + size_t last_ch; + size_t idx; + size_t row; + size_t col; +}; + + +struct TokenContext +{ + TokenContext(const deque<int> &d) + : data(d) + { + } + + + //! save before trying to parse something that may fail + void save() + { + save(s); + } + + + void save(TokenInfo &info) + { + info = c; + } + + + //! restore previous saved state + void restore() + { + restore(s); + } + + + void restore(const TokenInfo &info) + { + c = info; + } + + + bool more() + { + return(c.idx < data.size()); + } + + + size_t peek() + { + return(more() ? data[c.idx] : 0); + } + + + size_t peek(size_t idx) + { + idx += c.idx; + return((idx < data.size()) ? data[idx] : 0); + } + + + size_t get() + { + if (more()) + { + size_t ch = data[c.idx++]; + + switch (ch) + { + case '\t': + log_rule_B("input_tab_size"); + c.col = calc_next_tab_column(c.col, options::input_tab_size()); + break; + + case '\n': + + if (c.last_ch != '\r') + { + c.row++; + c.col = 1; + } + break; + + case '\r': + c.row++; + c.col = 1; + break; + + default: + c.col++; + break; + } + c.last_ch = ch; + return(ch); + } + return(0); + } + + + bool expect(size_t ch) + { + if (peek() == ch) + { + get(); + return(true); + } + return(false); + } + + + const deque<int> &data; + TokenInfo c; //! current + TokenInfo s; //! saved +}; + + +/** + * Count the number of characters in a quoted string. + * The next bit of text starts with a quote char " or ' or <. + * Count the number of characters until the matching character. + * + * @param pc The structure to update, str is an input. + * + * @return Whether a string was parsed + */ +static bool parse_string(TokenContext &ctx, Chunk &pc, size_t quote_idx, bool allow_escape); + + +/** + * Literal string, ends with single " + * Two "" don't end the string. + * + * @param pc The structure to update, str is an input. + * + * @return Whether a string was parsed + */ +static bool parse_cs_string(TokenContext &ctx, Chunk &pc); + + +/** + * VALA verbatim string, ends with three quotes (""") + * + * @param pc The structure to update, str is an input. + */ +static void parse_verbatim_string(TokenContext &ctx, Chunk &pc); + + +static bool tag_compare(const deque<int> &d, size_t a_idx, size_t b_idx, size_t len); + + +/** + * Parses a C++0x 'R' string. R"( xxx )" R"tag( )tag" u8R"(x)" uR"(x)" + * Newlines may be in the string. + * + * @param pc structure to update, str is an input. + */ +static bool parse_cr_string(TokenContext &ctx, Chunk &pc, size_t q_idx); + + +/** + * Count the number of whitespace characters. + * + * @param pc The structure to update, str is an input. + * + * @return Whether whitespace was parsed + */ +static bool parse_whitespace(TokenContext &ctx, Chunk &pc); + + +/** + * Called when we hit a backslash. + * If there is nothing but whitespace until the newline, then this is a + * backslash newline + * + * @param pc structure to update, str is an input + */ +static bool parse_bs_newline(TokenContext &ctx, Chunk &pc); + + +/** + * Parses any number of tab or space chars followed by a newline. + * Does not change pc.len if a newline isn't found. + * This is not the same as parse_whitespace() because it only consumes until + * a single newline is encountered. + */ +static bool parse_newline(TokenContext &ctx); + + +/** + * PAWN #define is different than C/C++. + * #define PATTERN REPLACEMENT_TEXT + * The PATTERN may not contain a space or '[' or ']'. + * A generic whitespace check should be good enough. + * Do not change the pattern. + * + * @param pc structure to update, str is an input + */ +static void parse_pawn_pattern(TokenContext &ctx, Chunk &pc, E_Token tt); + + +static bool parse_ignored(TokenContext &ctx, Chunk &pc); + + +/** + * Skips the next bit of whatever and returns the type of block. + * + * pc.str is the input text. + * pc.len in the output length. + * pc.type is the output type + * pc.column is output column + * + * @param pc The structure to update, str is an input. + * @param prev_pc The previous structure + * + * @return true/false - whether anything was parsed + */ +static bool parse_next(TokenContext &ctx, Chunk &pc, const Chunk *prev_pc); + + +/** + * Parses all legal D string constants. + * + * Quoted strings: + * r"Wysiwyg" # WYSIWYG string + * x"hexstring" # Hexadecimal array + * `Wysiwyg` # WYSIWYG string + * 'char' # single character + * "reg_string" # regular string + * + * Non-quoted strings: + * \x12 # 1-byte hex constant + * \u1234 # 2-byte hex constant + * \U12345678 # 4-byte hex constant + * \123 # octal constant + * \& # named entity + * \n # single character + * + * @param pc The structure to update, str is an input. + * + * @return Whether a string was parsed + */ +static bool d_parse_string(TokenContext &ctx, Chunk &pc); + + +/** + * Figure of the length of the comment at text. + * The next bit of text starts with a '/', so it might be a comment. + * There are three types of comments: + * - C comments that start with '/ *' and end with '* /' + * - C++ comments that start with // + * - D nestable comments '/+' '+/' + * + * @param pc The structure to update, str is an input. + * + * @return Whether a comment was parsed + */ +static bool parse_comment(TokenContext &ctx, Chunk &pc); + + +/** + * Figure of the length of the code placeholder at text, if present. + * This is only for Xcode which sometimes inserts temporary code placeholder chunks, which in plaintext <#look like this#>. + * + * @param pc The structure to update, str is an input. + * + * @return Whether a placeholder was parsed. + */ +static bool parse_code_placeholder(TokenContext &ctx, Chunk &pc); + + +/** + * Parse any attached suffix, which may be a user-defined literal suffix. + * If for a string, explicitly exclude common format and scan specifiers, ie, + * PRIx32 and SCNx64. + */ +static void parse_suffix(TokenContext &ctx, Chunk &pc, bool forstring); + + +//! check if a symbol holds a boolean value +static bool is_bin(int ch); +static bool is_bin_(int ch); + + +//! check if a symbol holds a octal value +static bool is_oct(int ch); +static bool is_oct_(int ch); + + +//! check if a symbol holds a decimal value; +static bool is_dec(int ch); +static bool is_dec_(int ch); + + +//! check if a symbol holds a hexadecimal value +static bool is_hex(int ch); +static bool is_hex_(int ch); + + +/** + * Count the number of characters in the number. + * The next bit of text starts with a number (0-9 or '.'), so it is a number. + * Count the number of characters in the number. + * + * This should cover all number formats for all languages. + * Note that this is not a strict parser. It will happily parse numbers in + * an invalid format. + * + * For example, only D allows underscores in the numbers, but they are + * allowed in all formats. + * + * @param[in,out] pc The structure to update, str is an input. + * + * @return Whether a number was parsed + */ +static bool parse_number(TokenContext &ctx, Chunk &pc); + + +static bool d_parse_string(TokenContext &ctx, Chunk &pc) +{ + size_t ch = ctx.peek(); + + if ( ch == '"' // 34 + || ch == '\'') // 39 + { + return(parse_string(ctx, pc, 0, true)); + } + + if (ch == '`') // 96 + { + return(parse_string(ctx, pc, 0, false)); + } + + if ( ( ch == 'r' // 114 + || ch == 'x') // 120 + && ctx.peek(1) == '"') // 34 + { + return(parse_string(ctx, pc, 1, false)); + } + + if (ch != '\\') + { + return(false); + } + ctx.save(); + int cnt; + + pc.Str().clear(); + + while (ctx.peek() == '\\') // 92 + { + pc.Str().append(ctx.get()); + + // Check for end of file + switch (ctx.peek()) + { + case 'x': // \x HexDigit HexDigit + cnt = 3; + + while (cnt--) + { + pc.Str().append(ctx.get()); + } + break; + + case 'u': // \u HexDigit (x4) + cnt = 5; + + while (cnt--) + { + pc.Str().append(ctx.get()); + } + break; + + case 'U': // \U HexDigit (x8) + cnt = 9; + + while (cnt--) + { + pc.Str().append(ctx.get()); + } + break; + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + // handle up to 3 octal digits + pc.Str().append(ctx.get()); + ch = ctx.peek(); + + if ( (ch >= '0') + && (ch <= '7')) + { + pc.Str().append(ctx.get()); + ch = ctx.peek(); + + if ( (ch >= '0') + && (ch <= '7')) + { + pc.Str().append(ctx.get()); + } + } + break; + + case '&': + // \& NamedCharacterEntity ; + pc.Str().append(ctx.get()); + + while (unc_isalpha(ctx.peek())) + { + pc.Str().append(ctx.get()); + } + + if (ctx.peek() == ';') // 59 + { + pc.Str().append(ctx.get()); + } + break; + + default: + // Everything else is a single character + pc.Str().append(ctx.get()); + break; + } // switch + } + + if (pc.GetStr().size() < 1) + { + ctx.restore(); + return(false); + } + pc.SetType(CT_STRING); + return(true); +} // d_parse_string + + +#if 0 + + +//! A string-in-string search. Like strstr() with a haystack length. +static const char *str_search(const char *needle, const char *haystack, int haystack_len) +{ + int needle_len = strlen(needle); + + while (haystack_len-- >= needle_len) + { + if (memcmp(needle, haystack, needle_len) == 0) + { + return(haystack); + } + haystack++; + } + return(NULL); +} +#endif + + +static bool parse_comment(TokenContext &ctx, Chunk &pc) +{ + bool is_d = language_is_set(LANG_D); + bool is_cs = language_is_set(LANG_CS); + size_t d_level = 0; + + // does this start with '/ /' or '/ *' or '/ +' (d) + if ( (ctx.peek() != '/') + || ( (ctx.peek(1) != '*') + && (ctx.peek(1) != '/') + && ( (ctx.peek(1) != '+') + || !is_d))) + { + return(false); + } + ctx.save(); + + // account for opening two chars + pc.Str() = ctx.get(); // opening '/' + size_t ch = ctx.get(); + + pc.Str().append(ch); // second char + + if (ch == '/') // 47 + { + pc.SetType(CT_COMMENT_CPP); + + while (true) + { + int bs_cnt = 0; + + while (ctx.more()) + { + ch = ctx.peek(); + + if ( (ch == '\r') + || (ch == '\n')) + { + break; + } + + if ( (ch == '\\') // 92 + && !is_cs) // backslashes aren't special in comments in C# + { + bs_cnt++; + } + else + { + bs_cnt = 0; + } + pc.Str().append(ctx.get()); + } + + /* + * If we hit an odd number of backslashes right before the newline, + * then we keep going. + */ + if ( ((bs_cnt & 1) == 0) + || !ctx.more()) + { + break; + } + + if (ctx.peek() == '\r') + { + pc.Str().append(ctx.get()); + } + + if (ctx.peek() == '\n') + { + pc.Str().append(ctx.get()); + } + pc.SetNlCount(pc.GetNlCount() + 1); + cpd.did_newline = true; + } + } + else if (!ctx.more()) + { + // unexpected end of file + ctx.restore(); + return(false); + } + else if (ch == '+') // 43 + { + pc.SetType(CT_COMMENT); + d_level++; + + while ( d_level > 0 + && ctx.more()) + { + if ( (ctx.peek() == '+') // 43 + && (ctx.peek(1) == '/')) // 47 + { + pc.Str().append(ctx.get()); // store the '+' + pc.Str().append(ctx.get()); // store the '/' + d_level--; + continue; + } + + if ( (ctx.peek() == '/') // 47 + && (ctx.peek(1) == '+')) // 43 + { + pc.Str().append(ctx.get()); // store the '/' + pc.Str().append(ctx.get()); // store the '+' + d_level++; + continue; + } + ch = ctx.get(); + pc.Str().append(ch); + + if ( (ch == '\n') + || (ch == '\r')) + { + pc.SetType(CT_COMMENT_MULTI); + pc.SetNlCount(pc.GetNlCount() + 1); + + if (ch == '\r') + { + if (ctx.peek() == '\n') + { + ++LE_COUNT(CRLF); + pc.Str().append(ctx.get()); // store the '\n' + } + else + { + ++LE_COUNT(CR); + } + } + else + { + ++LE_COUNT(LF); + } + } + } + } + else // must be '/ *' + { + pc.SetType(CT_COMMENT); + + while (ctx.more()) + { + if ( (ctx.peek() == '*') // 43 + && (ctx.peek(1) == '/')) // 47 + { + pc.Str().append(ctx.get()); // store the '*' + pc.Str().append(ctx.get()); // store the '/' + + TokenInfo ss; + ctx.save(ss); + size_t oldsize = pc.GetStr().size(); + + // If there is another C comment right after this one, combine them + while ( (ctx.peek() == ' ') // 32 + || (ctx.peek() == '\t')) // tab + { + pc.Str().append(ctx.get()); + } + + if ( (ctx.peek() != '/') + || (ctx.peek(1) != '*')) + { + // undo the attempt to join + ctx.restore(ss); + pc.Str().resize(oldsize); + break; + } + } + ch = ctx.get(); + pc.Str().append(ch); + + if ( (ch == '\n') + || (ch == '\r')) + { + pc.SetType(CT_COMMENT_MULTI); + pc.SetNlCount(pc.GetNlCount() + 1); + + if (ch == '\r') + { + if (ctx.peek() == '\n') + { + ++LE_COUNT(CRLF); + pc.Str().append(ctx.get()); // store the '\n' + } + else + { + ++LE_COUNT(CR); + } + } + else + { + ++LE_COUNT(LF); + } + } + } + } + + if (cpd.unc_off) + { + bool found_enable_marker = (find_enable_processing_comment_marker(pc.GetStr()) >= 0); + + if (found_enable_marker) + { + const auto &ontext = options::enable_processing_cmt(); + + LOG_FMT(LBCTRL, "%s(%d): Found '%s' on line %zu\n", + __func__, __LINE__, ontext.c_str(), pc.GetOrigLine()); + cpd.unc_off = false; + } + } + else + { + auto position_disable_processing_cmt = find_disable_processing_comment_marker(pc.GetStr()); + bool found_disable_marker = (position_disable_processing_cmt >= 0); + + if (found_disable_marker) + { + /** + * the user may wish to disable processing part of a multiline comment, + * in which case we'll handle at a late time. Check to see if processing + * is re-enabled elsewhere in this comment + */ + auto position_enable_processing_cmt = find_enable_processing_comment_marker(pc.GetStr()); + + if (position_enable_processing_cmt < position_disable_processing_cmt) + { + const auto &offtext = options::disable_processing_cmt(); + + LOG_FMT(LBCTRL, "%s(%d): Found '%s' on line %zu\n", + __func__, __LINE__, offtext.c_str(), pc.GetOrigLine()); + cpd.unc_off = true; + // Issue #842 + cpd.unc_off_used = true; + } + } + } + return(true); +} // parse_comment + + +static bool parse_code_placeholder(TokenContext &ctx, Chunk &pc) +{ + if ( (ctx.peek() != '<') + || (ctx.peek(1) != '#')) + { + return(false); + } + ctx.save(); + + // account for opening two chars '<#' + pc.Str() = ctx.get(); + pc.Str().append(ctx.get()); + + // grab everything until '#>', fail if not found. + size_t last1 = 0; + + while (ctx.more()) + { + size_t last2 = last1; + last1 = ctx.get(); + pc.Str().append(last1); + + if ( (last2 == '#') // 35 + && (last1 == '>')) // 62 + { + pc.SetType(CT_WORD); + return(true); + } + } + ctx.restore(); + return(false); +} + + +static void parse_suffix(TokenContext &ctx, Chunk &pc, bool forstring = false) +{ + if (CharTable::IsKw1(ctx.peek())) + { + size_t slen = 0; + size_t oldsize = pc.GetStr().size(); + + // don't add the suffix if we see L" or L' or S" + size_t p1 = ctx.peek(); + size_t p2 = ctx.peek(1); + + if ( forstring + && ( ( (p1 == 'L') // 76 + && ( (p2 == '"') // 34 + || (p2 == '\''))) // 39 + || ( (p1 == 'S') // 83 + && (p2 == '"')))) // 34 + { + return; + } + TokenInfo ss; + ctx.save(ss); + + while ( ctx.more() + && CharTable::IsKw2(ctx.peek())) + { + slen++; + pc.Str().append(ctx.get()); + } + + if ( forstring + && slen >= 4 + && ( pc.GetStr().startswith("PRI", oldsize) + || pc.GetStr().startswith("SCN", oldsize))) + { + ctx.restore(ss); + pc.Str().resize(oldsize); + } + } +} // parse_suffix + + +static bool is_bin(int ch) +{ + return( (ch == '0') // 48 + || (ch == '1')); // 49 +} + + +static bool is_bin_(int ch) +{ + return( is_bin(ch) + || ch == '_' // 95 + || ch == '\''); // 39 +} + + +static bool is_oct(int ch) +{ + return( (ch >= '0') // 48 + && (ch <= '7')); // 55 +} + + +static bool is_oct_(int ch) +{ + return( is_oct(ch) + || ch == '_' // 95 + || ch == '\''); // 39 +} + + +static bool is_dec(int ch) +{ + return( (ch >= '0') // 48 + && (ch <= '9')); // 57 +} + + +static bool is_dec_(int ch) +{ + // number separators: JAVA: "_", C++14: "'" + return( is_dec(ch) + || (ch == '_') // 95 + || (ch == '\'')); // 39 +} + + +static bool is_hex(int ch) +{ + return( ( (ch >= '0') // 48 + && (ch <= '9')) // 57 + || ( (ch >= 'a') // 97 + && (ch <= 'f')) // 102 + || ( (ch >= 'A') // 65 + && (ch <= 'F'))); // 70 +} + + +static bool is_hex_(int ch) +{ + return( is_hex(ch) + || ch == '_' // 95 + || ch == '\''); // 39 +} + + +static bool parse_number(TokenContext &ctx, Chunk &pc) +{ + /* + * A number must start with a digit or a dot, followed by a digit + * (signs handled elsewhere) + */ + if ( !is_dec(ctx.peek()) + && ( (ctx.peek() != '.') // 46 + || !is_dec(ctx.peek(1)))) + { + return(false); + } + bool is_float = (ctx.peek() == '.'); // 46 + + if ( is_float + && (ctx.peek(1) == '.')) // make sure it isn't '..' 46 + { + return(false); + } + /* + * Check for Hex, Octal, or Binary + * Note that only D, C++14 and Pawn support binary + * Fixes the issue # 1591 + * In c# the numbers starting with 0 are not treated as octal numbers. + */ + bool did_hex = false; + + if ( ctx.peek() == '0' // 48 + && !language_is_set(LANG_CS)) + { + size_t ch; + Chunk pc_temp; + + pc.Str().append(ctx.get()); // store the '0' + pc_temp.Str().append('0'); + + // MS constant might have an "h" at the end. Look for it + ctx.save(); + + while ( ctx.more() + && CharTable::IsKw2(ctx.peek())) + { + ch = ctx.get(); + pc_temp.Str().append(ch); + } + ch = pc_temp.GetStr()[pc_temp.Len() - 1]; + ctx.restore(); + LOG_FMT(LBCTRL, "%s(%d): pc_temp:%s\n", __func__, __LINE__, pc_temp.Text()); + + if (ch == 'h') // TODO can we combine this in analyze_character 104 + { + // we have an MS hexadecimal number with "h" at the end + LOG_FMT(LBCTRL, "%s(%d): MS hexadecimal number\n", __func__, __LINE__); + did_hex = true; + + do + { + pc.Str().append(ctx.get()); // store the rest + } while (is_hex_(ctx.peek())); + + pc.Str().append(ctx.get()); // store the h + LOG_FMT(LBCTRL, "%s(%d): pc:%s\n", __func__, __LINE__, pc.Text()); + } + else + { + switch (unc_toupper(ctx.peek())) + { + case 'X': // hex + did_hex = true; + + do + { + pc.Str().append(ctx.get()); // store the 'x' and then the rest + } while (is_hex_(ctx.peek())); + + break; + + case 'B': // binary + + do + { + pc.Str().append(ctx.get()); // store the 'b' and then the rest + } while (is_bin_(ctx.peek())); + + break; + + case '0': // octal or decimal + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + + do + { + pc.Str().append(ctx.get()); + } while (is_oct_(ctx.peek())); + + break; + + default: + // either just 0 or 0.1 or 0UL, etc + break; + } // switch + } + } + else + { + // Regular int or float + while (is_dec_(ctx.peek())) + { + pc.Str().append(ctx.get()); + } + } + + // Check if we stopped on a decimal point & make sure it isn't '..' + if ( (ctx.peek() == '.') // 46 + && (ctx.peek(1) != '.')) // 46 + { + // Issue #1265, 5.clamp() + TokenInfo ss; + ctx.save(ss); + + while ( ctx.more() + && CharTable::IsKw2(ctx.peek(1))) + { + // skip characters to check for paren open + ctx.get(); + } + + if (ctx.peek(1) == '(') // 40 + { + ctx.restore(ss); + pc.SetType(CT_NUMBER); + return(true); + } + else + { + ctx.restore(ss); + } + pc.Str().append(ctx.get()); + is_float = true; + + if (did_hex) + { + while (is_hex_(ctx.peek())) + { + pc.Str().append(ctx.get()); + } + } + else + { + while (is_dec_(ctx.peek())) + { + pc.Str().append(ctx.get()); + } + } + } + /* + * Check exponent + * Valid exponents per language (not that it matters): + * C/C++/D/Java: eEpP + * C#/Pawn: eE + */ + size_t tmp = unc_toupper(ctx.peek()); + + if ( (tmp == 'E') // 69 + || (tmp == 'P')) // 80 + { + is_float = true; + pc.Str().append(ctx.get()); + + if ( (ctx.peek() == '+') // 43 + || (ctx.peek() == '-')) // 45 + { + pc.Str().append(ctx.get()); + } + + while (is_dec_(ctx.peek())) + { + pc.Str().append(ctx.get()); + } + } + + /* + * Check the suffixes + * Valid suffixes per language (not that it matters): + * Integer Float + * C/C++: uUlL64 lLfF + * C#: uUlL fFdDMm + * D: uUL ifFL + * Java: lL fFdD + * Pawn: (none) (none) + * + * Note that i, f, d, and m only appear in floats. + */ + while (1) + { + size_t tmp2 = unc_toupper(ctx.peek()); + + // https://en.cppreference.com/w/cpp/language/floating_literal + if ( (tmp2 == 'I') // 73 + || (tmp2 == 'F') // 70 + || (tmp2 == 'D') // 68 + || (tmp2 == 'M')) // 77 + { + // is a decimal point found? Issue #4027 + const char *test_it = pc.Text(); + size_t test_long = strlen(test_it); + bool point_found = false; + + for (size_t ind = 0; ind < test_long; ind++) + { + if (test_it[ind] == '.') + { + point_found = true; + break; + } + } + + if (point_found) + { + is_float = true; + } + else + { + // append the char(s) until is not IsKw2 + while (ctx.more()) + { + size_t ch = ctx.peek(); + + if (CharTable::IsKw2(ch)) + { + pc.Str().append(ctx.get()); + } + else + { + break; + } + } + pc.SetType(CT_WORD); + return(true); + } + } + else if ( (tmp2 != 'L') // 76 + && (tmp2 != 'U')) // 85 + { + break; + } + pc.Str().append(ctx.get()); + } + + // skip the Microsoft-specific '8' suffix + if ((ctx.peek() == '8')) // 56 + { + pc.Str().append(ctx.get()); + } + + // skip the Microsoft-specific '16', '32' and '64' suffix + if ( ( (ctx.peek() == '1') // 49 + && (ctx.peek(1) == '6')) // 54 + || ( (ctx.peek() == '3') // 51 + && (ctx.peek(1) == '2')) // 50 + || ( (ctx.peek() == '6') // 54 + && (ctx.peek(1) == '4'))) // 52 + { + pc.Str().append(ctx.get()); + pc.Str().append(ctx.get()); + } + + // skip the Microsoft-specific '128' suffix + if (( (ctx.peek() == '1') // 49 + && (ctx.peek(1) == '2') // 50 + && (ctx.peek(2) == '8'))) // 56 + { + pc.Str().append(ctx.get()); + pc.Str().append(ctx.get()); + pc.Str().append(ctx.get()); + } + pc.SetType(is_float ? CT_NUMBER_FP : CT_NUMBER); + + /* + * If there is anything left, then we are probably dealing with garbage or + * some sick macro junk. Eat it. + */ + parse_suffix(ctx, pc); + + return(true); +} // parse_number + + +static bool parse_string(TokenContext &ctx, Chunk &pc, size_t quote_idx, bool allow_escape) +{ + log_rule_B("string_escape_char"); + const size_t escape_char = options::string_escape_char(); + + log_rule_B("string_escape_char2"); + const size_t escape_char2 = options::string_escape_char2(); + + log_rule_B("string_replace_tab_chars"); + const bool should_escape_tabs = ( allow_escape + && options::string_replace_tab_chars() + && language_is_set(LANG_ALLC)); + + pc.Str().clear(); + + while (quote_idx-- > 0) + { + pc.Str().append(ctx.get()); + } + pc.SetType(CT_STRING); + const size_t termination_character = CharTable::Get(ctx.peek()) & 0xff; + + pc.Str().append(ctx.get()); // store the " + + bool escaped = false; + + while (ctx.more()) + { + const size_t ch = ctx.get(); + + // convert char 9 (\t) to chars \t + if ( (ch == '\t') + && should_escape_tabs) + { + const size_t lastcol = ctx.c.col - 1; + ctx.c.col = lastcol + 2; + pc.Str().append(escape_char); + pc.Str().append('t'); + continue; + } + pc.Str().append(ch); + + if (ch == '\n') + { + pc.SetNlCount(pc.GetNlCount() + 1); + pc.SetType(CT_STRING_MULTI); + } + else if ( ch == '\r' + && ctx.peek() != '\n') + { + pc.Str().append(ctx.get()); + pc.SetNlCount(pc.GetNlCount() + 1); + pc.SetType(CT_STRING_MULTI); + } + + // if last char in prev loop was escaped the one in the current loop isn't + if (escaped) + { + escaped = false; + continue; + } + + // see if the current char is a escape char + if (allow_escape) + { + if (ch == escape_char) + { + escaped = (escape_char != 0); + continue; + } + + if ( ch == escape_char2 + && (ctx.peek() == termination_character)) + { + escaped = allow_escape; + continue; + } + } + + if (ch == termination_character) + { + break; + } + } + parse_suffix(ctx, pc, true); + return(true); +} // parse_string + +enum cs_string_t +{ + CS_STRING_NONE = 0, + CS_STRING_STRING = 1 << 0, // is any kind of string + CS_STRING_VERBATIM = 1 << 1, // @"" style string + CS_STRING_INTERPOLATED = 1 << 2, // $"" or $@"" style string +}; + +static cs_string_t operator|=(cs_string_t &value, cs_string_t other) +{ + return(value = static_cast<cs_string_t>(value | other)); +} + + +static cs_string_t parse_cs_string_start(TokenContext &ctx, Chunk &pc) +{ + cs_string_t stringType = CS_STRING_NONE; + int offset = 0; + + if (ctx.peek(offset) == '$') // 36 + { + stringType |= CS_STRING_INTERPOLATED; + ++offset; + } + + if (ctx.peek(offset) == '@') // 64 + { + stringType |= CS_STRING_VERBATIM; + ++offset; + } + + if (ctx.peek(offset) == '"') // 34 + { + stringType |= CS_STRING_STRING; + + pc.SetType(CT_STRING); + + for (int i = 0; i <= offset; ++i) + { + pc.Str().append(ctx.get()); + } + } + else + { + stringType = CS_STRING_NONE; + } + return(stringType); +} // parse_cs_string_start + + +struct CsStringParseState +{ + cs_string_t type; + int braceDepth; + + + CsStringParseState(cs_string_t stringType) + { + type = stringType; + braceDepth = 0; + } +}; + + +/** + * C# strings are complex enough (mostly due to interpolation and nesting) that they need a custom parser. + */ +static bool parse_cs_string(TokenContext &ctx, Chunk &pc) +{ + cs_string_t stringType = parse_cs_string_start(ctx, pc); + + if (stringType == CS_STRING_NONE) + { + return(false); + } + // an interpolated string can contain {expressions}, which can contain $"strings", which in turn + // can contain {expressions}, so we must track both as they are interleaved, in order to properly + // parse the outermost string. + + std::stack<CsStringParseState> parseState; // each entry is a nested string + + parseState.push(CsStringParseState(stringType)); + + log_rule_B("string_replace_tab_chars"); + bool should_escape_tabs = options::string_replace_tab_chars(); + + while (ctx.more()) + { + if (parseState.top().braceDepth > 0) + { + // all we can do when in an expr is look for expr close with }, or a new string opening. must do this first + // so we can peek and potentially consume chars for new string openings, before the ch=get() happens later, + // which is needed for newline processing. + + if (ctx.peek() == '}') // 125 + { + pc.Str().append(ctx.get()); + + if (ctx.peek() == '}') // 125 + { + pc.Str().append(ctx.get()); // in interpolated string, `}}` is escape'd `}` + } + else + { + --parseState.top().braceDepth; + } + continue; + } + stringType = parse_cs_string_start(ctx, pc); + + if (stringType != CS_STRING_NONE) + { + parseState.push(CsStringParseState(stringType)); + continue; + } + } + int lastcol = ctx.c.col; + int ch = ctx.get(); + + pc.Str().append(ch); + + if (ch == '\n') + { + pc.SetType(CT_STRING_MULTI); + pc.SetNlCount(pc.GetNlCount() + 1); + } + else if (ch == '\r') + { + pc.SetType(CT_STRING_MULTI); + } + else if (parseState.top().braceDepth > 0) + { + // do nothing. if we're in a brace, we only want the newline handling, and skip the rest. + } + else if ( (ch == '\t') + && should_escape_tabs) + { + if (parseState.top().type & CS_STRING_VERBATIM) + { + if (!cpd.warned_unable_string_replace_tab_chars) + { + cpd.warned_unable_string_replace_tab_chars = true; + + log_rule_B("warn_level_tabs_found_in_verbatim_string_literals"); + log_sev_t warnlevel = (log_sev_t)options::warn_level_tabs_found_in_verbatim_string_literals(); + + /* + * a tab char can't be replaced with \\t because escapes don't + * work in here-strings. best we can do is warn. + */ + LOG_FMT(warnlevel, "%s(%d): %s: orig line is %zu, orig col is %zu, Detected non-replaceable tab char in literal string\n", + __func__, __LINE__, cpd.filename.c_str(), pc.GetOrigLine(), pc.GetOrigCol()); + LOG_FMT(warnlevel, "%s(%d): Warning is given if doing tab-to-\\t replacement and we have found one in a C# verbatim string literal.\n", + __func__, __LINE__); + + if (warnlevel < LWARN) + { + // TODO: replace the code ?? cpd.error_count++; + } + } + } + else + { + ctx.c.col = lastcol + 2; + pc.Str().pop_back(); // remove \t + pc.Str().append("\\t"); + + continue; + } + } + else if ( ch == '\\' + && !(parseState.top().type & CS_STRING_VERBATIM)) + { + // catch escaped quote in order to avoid ending string (but also must handle \\ to avoid accidental 'escape' seq of `\\"`) + if ( ctx.peek() == '"' // 34 + || ctx.peek() == '\\') // 92 + { + pc.Str().append(ctx.get()); + } + } + else if (ch == '"') // 34 + { + if ( (parseState.top().type & CS_STRING_VERBATIM) + && (ctx.peek() == '"')) // 34 + { + // in verbatim string, `""` is escape'd `"` + pc.Str().append(ctx.get()); + } + else + { + // end of string + parseState.pop(); + + if (parseState.empty()) + { + break; + } + } + } + else if (parseState.top().type & CS_STRING_INTERPOLATED) + { + if (ch == '{') // 123 + { + if (ctx.peek() == '{') // 123 + { + pc.Str().append(ctx.get()); // in interpolated string, `{{` is escape'd `{` + } + else + { + ++parseState.top().braceDepth; + } + } + } + } + return(true); +} // parse_cs_string + + +static void parse_verbatim_string(TokenContext &ctx, Chunk &pc) +{ + pc.SetType(CT_STRING); + + // consume the initial """ + pc.Str() = ctx.get(); + pc.Str().append(ctx.get()); + pc.Str().append(ctx.get()); + + // go until we hit a zero (end of file) or a """ + while (ctx.more()) + { + size_t ch = ctx.get(); + pc.Str().append(ch); + + if ( (ch == '"') // 34 + && (ctx.peek() == '"') // 34 + && (ctx.peek(1) == '"')) // 34 + { + pc.Str().append(ctx.get()); + pc.Str().append(ctx.get()); + break; + } + + if ( (ch == '\n') + || (ch == '\r')) + { + pc.SetType(CT_STRING_MULTI); + pc.SetNlCount(pc.GetNlCount() + 1); + } + } +} + + +static bool tag_compare(const deque<int> &d, size_t a_idx, size_t b_idx, size_t len) +{ + if (a_idx != b_idx) + { + while (len-- > 0) + { + if (d[a_idx] != d[b_idx]) + { + return(false); + } + } + } + return(true); +} + + +static bool parse_cr_string(TokenContext &ctx, Chunk &pc, size_t q_idx) +{ + size_t tag_idx = ctx.c.idx + q_idx + 1; + size_t tag_len = 0; + + ctx.save(); + + // Copy the prefix + " to the string + pc.Str().clear(); + int cnt = q_idx + 1; + + while (cnt--) + { + pc.Str().append(ctx.get()); + } + + // Add the tag and get the length of the tag + while ( ctx.more() + && (ctx.peek() != '(')) + { + tag_len++; + pc.Str().append(ctx.get()); + } + + if (ctx.peek() != '(') + { + ctx.restore(); + return(false); + } + pc.SetType(CT_STRING); + + while (ctx.more()) + { + if ( (ctx.peek() == ')') // 41 + && (ctx.peek(tag_len + 1) == '"') // 34 + && tag_compare(ctx.data, tag_idx, ctx.c.idx + 1, tag_len)) + { + cnt = tag_len + 2; // for the )" + + while (cnt--) + { + pc.Str().append(ctx.get()); + } + parse_suffix(ctx, pc); + return(true); + } + + if (ctx.peek() == '\n') + { + pc.Str().append(ctx.get()); + pc.SetNlCount(pc.GetNlCount() + 1); + pc.SetType(CT_STRING_MULTI); + } + else + { + pc.Str().append(ctx.get()); + } + } + ctx.restore(); + return(false); +} // parse_cr_string + + +/** + * Count the number of characters in a word. + * The first character is already valid for a keyword + * + * @param pc The structure to update, str is an input. + * @return Whether a word was parsed (always true) + */ +static bool parse_word(TokenContext &ctx, Chunk &pc, bool skipcheck) +{ + static UncText intr_txt("@interface"); + + // The first character is already valid + pc.Str().clear(); + pc.Str().append(ctx.get()); + + while (ctx.more()) + { + size_t ch = ctx.peek(); + + if (CharTable::IsKw2(ch)) + { + pc.Str().append(ctx.get()); + } + else if ( (ch == '\\') // 92 + && (unc_tolower(ctx.peek(1)) == 'u')) // 117 + { + pc.Str().append(ctx.get()); + pc.Str().append(ctx.get()); + skipcheck = true; + } + else + { + break; + } + + // HACK: Non-ASCII character are only allowed in identifiers + if (ch > 0x7f) + { + skipcheck = true; + } + } + pc.SetType(CT_WORD); + + if (skipcheck) + { + return(true); + } + + // Detect pre-processor functions now + if ( cpd.in_preproc == CT_PP_DEFINE + && cpd.preproc_ncnl_count == 1) + { + if (ctx.peek() == '(') // 40 + { + pc.SetType(CT_MACRO_FUNC); + } + else + { + pc.SetType(CT_MACRO); + + log_rule_B("pp_ignore_define_body"); + + if (options::pp_ignore_define_body()) + { + /* + * We are setting the PP_IGNORE preproc state because the following + * chunks are part of the macro body and will have to be ignored. + */ + cpd.in_preproc = CT_PP_IGNORE; + } + } + } + else + { + // '@interface' is reserved, not an interface itself + if ( language_is_set(LANG_JAVA) + && pc.GetStr().startswith("@") + && !pc.GetStr().equals(intr_txt)) + { + pc.SetType(CT_ANNOTATION); + } + else + { + // Turn it into a keyword now + // Issue #1460 will return "COMMENT_CPP" + pc.SetType(find_keyword_type(pc.Text(), pc.GetStr().size())); + + /* Special pattern: if we're trying to redirect a preprocessor directive to PP_IGNORE, + * then ensure we're actually part of a preprocessor before doing the swap, or we'll + * end up with a function named 'define' as PP_IGNORE. This is necessary because with + * the config 'set' feature, there's no way to do a pair of tokens as a word + * substitution. */ + if ( pc.GetType() == CT_PP_IGNORE + && !cpd.in_preproc) + { + pc.SetType(find_keyword_type(pc.Text(), pc.GetStr().size())); + } + else if (pc.GetType() == CT_COMMENT_CPP) // Issue #1460 + { + size_t ch; + bool is_cs = language_is_set(LANG_CS); + + // read until EOL + while (true) + { + int bs_cnt = 0; + + while (ctx.more()) + { + ch = ctx.peek(); + + if ( (ch == '\r') + || (ch == '\n')) + { + break; + } + + if ( (ch == '\\') // 92 + && !is_cs) // backslashes aren't special in comments in C# + { + bs_cnt++; + } + else + { + bs_cnt = 0; + } + pc.Str().append(ctx.get()); + } + + /* + * If we hit an odd number of backslashes right before the newline, + * then we keep going. + */ + if ( ((bs_cnt & 1) == 0) + || !ctx.more()) + { + break; + } + + if (ctx.peek() == '\r') + { + pc.Str().append(ctx.get()); + } + + if (ctx.peek() == '\n') + { + pc.Str().append(ctx.get()); + } + pc.SetNlCount(pc.GetNlCount() + 1); + cpd.did_newline = true; + } + // Store off the end column + pc.SetOrigColEnd(ctx.c.col); + } + } + } + return(true); +} // parse_word + + +static size_t parse_attribute_specifier_sequence(TokenContext &ctx) +{ + size_t nested = 0; + size_t offset = 0; + size_t parens = 0; + auto ch1 = ctx.peek(offset++); + + while (ch1) + { + auto ch2 = ctx.peek(offset++); + + while ( ch2 == ' ' // 32 + || ch2 == '\n' + || ch2 == '\r' + || ch2 == '\t') + { + ch2 = ctx.peek(offset++); + } + + if ( nested == 0 + && ch2 != '[') // 91 + { + break; + } + + if (ch1 == '(') // 40 + { + ++parens; + ch1 = ch2; + continue; + } + + if (ch1 == ')') // 41 + { + if (parens == 0) + { + break; + } + --parens; + ch1 = ch2; + continue; + } + + if ( ch1 != '[' + && ch1 != ']') + { + ch1 = ch2; + continue; + } + + if (ch2 != ch1) + { + if (parens == 0) + { + break; + } + ch1 = ch2; + continue; + } + + if (ch1 == '[') // 91 + { + if ( nested != 0 + && parens == 0) + { + break; + } + ++nested; + } + else if (--nested == 0) + { + return(offset); + } + ch1 = ctx.peek(offset++); + } + return(0); +} // parse_attribute_specifier_sequence + + +static bool extract_attribute_specifier_sequence(TokenContext &ctx, Chunk &pc, size_t length) +{ + pc.Str().clear(); + + while (length--) + { + pc.Str().append(ctx.get()); + } + pc.SetType(CT_ATTRIBUTE); + return(true); +} // extract_attribute_specifier_sequence + + +static bool parse_whitespace(TokenContext &ctx, Chunk &pc) +{ + size_t nl_count = 0; + size_t ch = 0; + + // REVISIT: use a better whitespace detector? + while ( ctx.more() + && unc_isspace(ctx.peek())) + { + int lastcol = ctx.c.col; + ch = ctx.get(); // throw away the whitespace char + + switch (ch) + { + case '\r': + + if (ctx.expect('\n')) + { + // CRLF ending + ++LE_COUNT(CRLF); + } + else + { + // CR ending + ++LE_COUNT(CR); + } + nl_count++; + pc.SetOrigPrevSp(0); + break; + + case '\n': + // LF ending + ++LE_COUNT(LF); + nl_count++; + pc.SetOrigPrevSp(0); + break; + + case '\t': + pc.SetOrigPrevSp(pc.GetOrigPrevSp() + ctx.c.col - lastcol); + break; + + case ' ': + pc.SetOrigPrevSp(pc.GetOrigPrevSp() + 1); + break; + + default: + break; + } + } + + if (ch != 0) + { + pc.Str().clear(); + pc.SetType(nl_count ? CT_NEWLINE : CT_WHITESPACE); + pc.SetNlCount(nl_count); + pc.SetAfterTab((ctx.c.last_ch == '\t')); + return(true); + } + return(false); +} // parse_whitespace + + +static bool parse_bs_newline(TokenContext &ctx, Chunk &pc) +{ + ctx.save(); + ctx.get(); // skip the '\' + + size_t ch; + + while ( ctx.more() + && unc_isspace(ch = ctx.peek())) + { + ctx.get(); + + if ( (ch == '\r') + || (ch == '\n')) + { + if (ch == '\r') + { + ctx.expect('\n'); + } + pc.SetType(CT_NL_CONT); + pc.Str() = "\\"; + pc.SetNlCount(1); + return(true); + } + } + ctx.restore(); + return(false); +} + + +static bool parse_newline(TokenContext &ctx) +{ + ctx.save(); + + // Eat whitespace + while ( (ctx.peek() == ' ') // 32 + || (ctx.peek() == '\t')) + { + ctx.get(); + } + + if ( (ctx.peek() == '\r') + || (ctx.peek() == '\n')) + { + if (!ctx.expect('\n')) + { + ctx.get(); + ctx.expect('\n'); + } + return(true); + } + ctx.restore(); + return(false); +} + + +static void parse_pawn_pattern(TokenContext &ctx, Chunk &pc, E_Token tt) +{ + pc.Str().clear(); + pc.SetType(tt); + + while (!unc_isspace(ctx.peek())) + { + // end the pattern on an escaped newline + if (ctx.peek() == '\\') // 92 + { + size_t ch = ctx.peek(1); + + if ( (ch == '\n') + || (ch == '\r')) + { + break; + } + } + pc.Str().append(ctx.get()); + } +} + + +static bool parse_off_newlines(TokenContext &ctx, Chunk &pc) +{ + size_t nl_count = 0; + + // Parse off newlines/blank lines + while (parse_newline(ctx)) + { + nl_count++; + } + + if (nl_count > 0) + { + pc.SetNlCount(nl_count); + pc.SetType(CT_NEWLINE); + return(true); + } + return(false); +} + + +static bool parse_macro(TokenContext &ctx, Chunk &pc, const Chunk *prev_pc) +{ + if (parse_off_newlines(ctx, pc)) + { + return(true); + } + + if (parse_comment(ctx, pc)) // allow CT_COMMENT_MULTI within macros + { + return(true); + } + ctx.save(); + pc.Str().clear(); + + if (prev_pc->IsNullChunk()) + { + return(false); + } + bool continued = ( prev_pc->Is(CT_NL_CONT) + || prev_pc->Is(CT_COMMENT_MULTI)); + + while (ctx.more()) + { + size_t pk = ctx.peek(), pk1 = ctx.peek(1); + bool nl = ( pk == '\n' + || pk == '\r'); + bool nl_cont = ( pk == '\\' // 92 + && ( pk1 == '\n' + || pk1 == '\r')); + + if ( ( nl_cont + || ( continued + && nl)) + && pc.GetStr().size() > 0) + { + pc.SetType(CT_PP_IGNORE); + return(true); + } + else if (nl) + { + break; + } + pc.Str().append(ctx.get()); + } + pc.Str().clear(); + ctx.restore(); + return(false); +} // parse_macro + + +static bool parse_ignored(TokenContext &ctx, Chunk &pc) +{ + if (parse_off_newlines(ctx, pc)) + { + return(true); + } + // See if the options::enable_processing_cmt() or #pragma endasm / #endasm text is on this line + ctx.save(); + pc.Str().clear(); + + while ( ctx.more() + && (ctx.peek() != '\r') + && (ctx.peek() != '\n')) + { + pc.Str().append(ctx.get()); + } + + if (pc.GetStr().size() == 0) + { + // end of file? + return(false); + } + + // HACK: turn on if we find '#endasm' or '#pragma' and 'endasm' separated by blanks + if ( ( ( (pc.GetStr().find("#pragma ") >= 0) + || (pc.GetStr().find("#pragma ") >= 0)) + && ( (pc.GetStr().find(" endasm") >= 0) + || (pc.GetStr().find(" endasm") >= 0))) + || (pc.GetStr().find("#endasm") >= 0)) + { + cpd.unc_off = false; + ctx.restore(); + pc.Str().clear(); + return(false); + } + // Note that we aren't actually making sure this is in a comment, yet + log_rule_B("enable_processing_cmt"); + const auto &ontext = options::enable_processing_cmt(); + + if (!ontext.empty()) + { + bool found_enable_pattern = false; + + if ( ontext != UNCRUSTIFY_ON_TEXT + && options::processing_cmt_as_regex()) + { + std::wstring pc_wstring(pc.GetStr().get().cbegin(), + pc.GetStr().get().cend()); + std::wregex criteria(std::wstring(ontext.cbegin(), + ontext.cend())); + + found_enable_pattern = std::regex_search(pc_wstring.cbegin(), + pc_wstring.cend(), + criteria); + } + else + { + found_enable_pattern = (pc.GetStr().find(ontext.c_str()) >= 0); + } + + if (!found_enable_pattern) + { + pc.SetType(CT_IGNORED); + return(true); + } + } + ctx.restore(); + + // parse off whitespace leading to the comment + if (parse_whitespace(ctx, pc)) + { + pc.SetType(CT_IGNORED); + return(true); + } + + // Look for the ending comment and let it pass + if ( parse_comment(ctx, pc) + && !cpd.unc_off) + { + return(true); + } + // Reset the chunk & scan to until a newline + pc.Str().clear(); + + while ( ctx.more() + && (ctx.peek() != '\r') + && (ctx.peek() != '\n')) + { + pc.Str().append(ctx.get()); + } + + if (pc.GetStr().size() > 0) + { + pc.SetType(CT_IGNORED); + return(true); + } + return(false); +} // parse_ignored + + +static bool parse_next(TokenContext &ctx, Chunk &pc, const Chunk *prev_pc) +{ + if (!ctx.more()) + { + return(false); + } + // Save off the current column + pc.SetType(CT_NONE); + pc.SetOrigLine(ctx.c.row); + pc.SetColumn(ctx.c.col); + pc.SetOrigCol(ctx.c.col); + pc.SetNlCount(0); + pc.SetFlags(PCF_NONE); + + // If it is turned off, we put everything except newlines into CT_UNKNOWN + if (cpd.unc_off) + { + if (parse_ignored(ctx, pc)) + { + return(true); + } + } + log_rule_B("disable_processing_nl_cont"); + + // Parse macro blocks + if (options::disable_processing_nl_cont()) + { + if (parse_macro(ctx, pc, prev_pc)) + { + return(true); + } + } + + // Parse whitespace + if (parse_whitespace(ctx, pc)) + { + return(true); + } + + // Handle unknown/unhandled preprocessors + if ( cpd.in_preproc > CT_PP_BODYCHUNK + && cpd.in_preproc <= CT_PP_OTHER) + { + pc.Str().clear(); + TokenInfo ss; + ctx.save(ss); + // Chunk to a newline or comment + pc.SetType(CT_PREPROC_BODY); + size_t last = 0; + + while (ctx.more()) + { + size_t ch = ctx.peek(); + + // Fix for issue #1752 + // Ignoring extra spaces after ' \ ' for preproc body continuations + if ( last == '\\' // 92 + && ch == ' ') // 32 + { + ctx.get(); + continue; + } + + if ( (ch == '\n') + || (ch == '\r')) + { + // Back off if this is an escaped newline + if (last == '\\') // 92 + { + ctx.restore(ss); + pc.Str().pop_back(); + } + break; + } + + // Quit on a C or C++ comment start Issue #1966 + if ( (ch == '/') // 47 + && ( (ctx.peek(1) == '/') // 47 + || (ctx.peek(1) == '*'))) // 42 + { + break; + } + last = ch; + ctx.save(ss); + + pc.Str().append(ctx.get()); + } + + if (pc.GetStr().size() > 0) + { + return(true); + } + } + + // Detect backslash-newline + if ( (ctx.peek() == '\\') // 92 + && parse_bs_newline(ctx, pc)) + { + return(true); + } + + // Parse comments + if (parse_comment(ctx, pc)) + { + return(true); + } + + // Parse code placeholders + if (parse_code_placeholder(ctx, pc)) + { + return(true); + } + + if (language_is_set(LANG_CS)) + { + if (parse_cs_string(ctx, pc)) + { + return(true); + } + } + + if (language_is_set(LANG_CS | LANG_VALA)) + { + // check for non-keyword identifiers such as @if @switch, etc + // Vala also allows numeric identifiers if prefixed with '@' + if ( ctx.peek() == '@' // 64 + && ( CharTable::IsKw1(ctx.peek(1)) + || ( language_is_set(LANG_VALA) + && CharTable::IsKw2(ctx.peek(1))))) + { + parse_word(ctx, pc, true); + return(true); + } + } + + // handle VALA """ strings """ + if ( language_is_set(LANG_VALA) + && (ctx.peek() == '"') // 34 + && (ctx.peek(1) == '"') // 34 + && (ctx.peek(2) == '"')) // 34 + { + parse_verbatim_string(ctx, pc); + return(true); + } + /* + * handle C++(11) string/char literal prefixes u8|u|U|L|R including all + * possible combinations and optional R delimiters: R"delim(x)delim" + */ + auto ch = ctx.peek(); + + if ( language_is_set(LANG_C | LANG_CPP) + && ( ch == 'u' // 117 + || ch == 'U' // 85 + || ch == 'R' // 82 + || ch == 'L')) // 76 + { + auto idx = size_t{}; + auto is_real = false; + + if ( ch == 'u' // 117 + && ctx.peek(1) == '8') // 56 + { + idx = 2; + } + else if ( unc_tolower(ch) == 'u' // 117 + || ch == 'L') // 76 + { + idx++; + } + + if ( language_is_set(LANG_C | LANG_CPP) + && ctx.peek(idx) == 'R') // 82 + { + idx++; + is_real = true; + } + const auto quote = ctx.peek(idx); + + if (is_real) + { + if ( quote == '"' // 34 + && parse_cr_string(ctx, pc, idx)) + { + return(true); + } + } + else if ( ( quote == '"' // 34 + || quote == '\'') // 39 + && parse_string(ctx, pc, idx, true)) + { + return(true); + } + } + + // PAWN specific stuff + if (language_is_set(LANG_PAWN)) + { + if ( cpd.preproc_ncnl_count == 1 + && ( cpd.in_preproc == CT_PP_DEFINE + || cpd.in_preproc == CT_PP_EMIT)) + { + parse_pawn_pattern(ctx, pc, CT_MACRO); + return(true); + } + + // Check for PAWN strings: \"hi" or !"hi" or !\"hi" or \!"hi" + if ( (ctx.peek() == '\\') // 92 + || (ctx.peek() == '!')) // 33 + { + if (ctx.peek(1) == '"') // 32 + { + parse_string(ctx, pc, 1, (ctx.peek() == '!')); // 33 + return(true); + } + + if ( ( (ctx.peek(1) == '\\') // 92 + || (ctx.peek(1) == '!')) // 33 + && (ctx.peek(2) == '"')) // 32 + { + parse_string(ctx, pc, 2, false); + return(true); + } + } + + // handle PAWN preprocessor args %0 .. %9 + if ( cpd.in_preproc == CT_PP_DEFINE + && (ctx.peek() == '%') // 37 + && unc_isdigit(ctx.peek(1))) + { + pc.Str().clear(); + pc.Str().append(ctx.get()); + pc.Str().append(ctx.get()); + pc.SetType(CT_WORD); + return(true); + } + } + // Parse strings and character constants + + if (parse_number(ctx, pc)) + { + return(true); + } + + if (language_is_set(LANG_D)) + { + // D specific stuff + if (d_parse_string(ctx, pc)) + { + return(true); + } + } + else + { + // Not D stuff + + // Check for L'a', L"abc", 'a', "abc", <abc> strings + ch = ctx.peek(); + size_t ch1 = ctx.peek(1); + + if ( ( ( (ch == 'L') // 76 + || (ch == 'S')) // 83 + && ( (ch1 == '"') // 34 + || (ch1 == '\''))) // 39 + || (ch == '"') // 34 + || (ch == '\'') // 39 + || ( (ch == '<') // 60 + && cpd.in_preproc == CT_PP_INCLUDE)) + { + parse_string(ctx, pc, unc_isalpha(ch) ? 1 : 0, true); + + if (cpd.in_preproc == CT_PP_INCLUDE) + { + pc.SetParentType(CT_PP_INCLUDE); + } + return(true); + } + + if ( (ch == '<') // 60 + && cpd.in_preproc == CT_PP_DEFINE) + { + if (Chunk::GetTail()->Is(CT_MACRO)) + { + // We have "#define XXX <", assume '<' starts an include string + parse_string(ctx, pc, 0, false); + return(true); + } + } + /* Inside clang's __has_include() could be "path/to/file.h" or system-style <path/to/file.h> */ + Chunk *tail = Chunk::GetTail(); + + if ( (ch == '(') // 40 + && (tail->IsNotNullChunk()) + && ( tail->Is(CT_CNG_HASINC) + || tail->Is(CT_CNG_HASINCN))) + { + parse_string(ctx, pc, 0, false); + return(true); + } + } + + // Check for Vala string templates + if ( language_is_set(LANG_VALA) + && (ctx.peek() == '@')) // 64 + { + size_t nc = ctx.peek(1); + + if (nc == '"') // 34 + { + // literal string + parse_string(ctx, pc, 1, true); + return(true); + } + } + + // Check for Objective C literals + if ( language_is_set(LANG_OC) + && (ctx.peek() == '@')) // 64 + { + size_t nc = ctx.peek(1); + + if (nc == 'R') // Issue #2720 82 + { + if (ctx.peek(2) == '"') // 34 + { + if (parse_cr_string(ctx, pc, 2)) // Issue #3027 + { + return(true); + } + // parse string without escaping + parse_string(ctx, pc, 2, false); + return(true); + } + } + + if ( (nc == '"') // 34 + || (nc == '\'')) // 39 + { + // literal string + parse_string(ctx, pc, 1, true); + return(true); + } + + if ( (nc >= '0') + && (nc <= '9')) + { + // literal number + pc.Str().append(ctx.get()); // store the '@' + parse_number(ctx, pc); + return(true); + } + } + + // Check for pawn/ObjectiveC/Java and normal identifiers + if ( CharTable::IsKw1(ctx.peek()) + || ( (ctx.peek() == '\\') // 92 + && (unc_tolower(ctx.peek(1)) == 'u')) // 117 + || ( (ctx.peek() == '@') // 64 + && CharTable::IsKw1(ctx.peek(1)))) + { + parse_word(ctx, pc, false); + return(true); + } + + // Check for C++11/14/17/20 attribute specifier sequences + if ( language_is_set(LANG_CPP) + && ctx.peek() == '[') // 91 + { + if ( !language_is_set(LANG_OC) + || ( prev_pc->IsNotNullChunk() + && !prev_pc->Is(CT_OC_AT))) + { + if (auto length = parse_attribute_specifier_sequence(ctx)) + { + extract_attribute_specifier_sequence(ctx, pc, length); + return(true); + } + } + } + // see if we have a punctuator + char punc_txt[7]; + + punc_txt[0] = ctx.peek(); + punc_txt[1] = ctx.peek(1); + punc_txt[2] = ctx.peek(2); + punc_txt[3] = ctx.peek(3); + punc_txt[4] = ctx.peek(4); + punc_txt[5] = ctx.peek(5); + punc_txt[6] = '\0'; + const chunk_tag_t *punc; + + if ((punc = find_punctuator(punc_txt, cpd.lang_flags)) != nullptr) + { + int cnt = strlen(punc->tag); + + while (cnt--) + { + pc.Str().append(ctx.get()); + } + pc.SetType(punc->type); + pc.SetFlagBits(PCF_PUNCTUATOR); + return(true); + } + /* When parsing C/C++ files and running into some unknown token, + * check if matches Objective-C as a last resort, before + * considering it as garbage. + */ + int probe_lang_flags = 0; + + if (language_is_set(LANG_C | LANG_CPP)) + { + probe_lang_flags = cpd.lang_flags | LANG_OC; + } + + if (probe_lang_flags != 0) + { + if ((punc = find_punctuator(punc_txt, probe_lang_flags)) != nullptr) + { + cpd.lang_flags = probe_lang_flags; + int cnt = strlen(punc->tag); + + while (cnt--) + { + pc.Str().append(ctx.get()); + } + pc.SetType(punc->type); + pc.SetFlagBits(PCF_PUNCTUATOR); + return(true); + } + } + // throw away this character + pc.SetType(CT_UNKNOWN); + pc.Str().append(ctx.get()); + + LOG_FMT(LWARN, "%s:%zu Garbage in col %zu: %x\n", + cpd.filename.c_str(), pc.GetOrigLine(), ctx.c.col, pc.GetStr()[0]); + exit(EX_SOFTWARE); +} // parse_next + + +int find_disable_processing_comment_marker(const UncText &text, + std::size_t start_idx) +{ + log_rule_B("disable_processing_cmt"); + const auto &offtext = options::disable_processing_cmt(); + int idx = -1; + + if ( !offtext.empty() + && start_idx < text.size()) + { + if ( offtext != UNCRUSTIFY_OFF_TEXT + && options::processing_cmt_as_regex()) + { + std::wsmatch match; + std::wstring pc_wstring(text.get().cbegin() + start_idx, + text.get().cend()); + std::wregex criteria(std::wstring(offtext.cbegin(), + offtext.cend())); + + std::regex_search(pc_wstring.cbegin(), + pc_wstring.cend(), + match, + criteria); + + if (!match.empty()) + { + idx = int(match.position() + start_idx); + } + } + else + { + idx = text.find(offtext.c_str(), + start_idx); + + if (idx >= 0) + { + idx += int(offtext.size()); + } + } + + /** + * update the position to the start of the current line + */ + while ( idx > 0 + && text[idx - 1] != '\n') + { + --idx; + } + } + return(idx); +} // find_disable_processing_comment_marker + + +int find_enable_processing_comment_marker(const UncText &text, + std::size_t start_idx) +{ + log_rule_B("enable_processing_cmt"); + const auto &ontext = options::enable_processing_cmt(); + int idx = -1; + + if ( !ontext.empty() + && start_idx < text.size()) + { + if ( ontext != UNCRUSTIFY_ON_TEXT + && options::processing_cmt_as_regex()) + { + std::wsmatch match; + std::wstring pc_wstring(text.get().cbegin() + start_idx, + text.get().cend()); + std::wregex criteria(std::wstring(ontext.cbegin(), + ontext.cend())); + + std::regex_search(pc_wstring.cbegin(), + pc_wstring.cend(), + match, + criteria); + + if (!match.empty()) + { + idx = int(start_idx + match.position() + match.size()); + } + } + else + { + idx = text.find(ontext.c_str(), + start_idx); + + if (idx >= 0) + { + idx += int(ontext.size()); + } + } + + /** + * update the position to the end of the current line + */ + if (idx >= 0) + { + while ( idx < int(text.size()) + && text[idx] != '\n') + { + ++idx; + } + } + } + return(idx); +} // find_enable_processing_comment_marker + + +void tokenize(const deque<int> &data, Chunk *ref) +{ + TokenContext ctx(data); + Chunk chunk; + Chunk *pc = Chunk::NullChunkPtr; + Chunk *rprev = Chunk::NullChunkPtr; + bool last_was_tab = false; + size_t prev_sp = 0; + int num_stripped = 0; // Issue #1966 + + cpd.unc_stage = unc_stage_e::TOKENIZE; + + while (ctx.more()) + { + chunk.Reset(); + chunk.SetPpLevel(0); + + if (!parse_next(ctx, chunk, pc)) + { + LOG_FMT(LERR, "%s:%zu Bailed before the end?\n", + cpd.filename.c_str(), ctx.c.row); + exit(EX_SOFTWARE); + } + + if ( language_is_set(LANG_JAVA) + && chunk.GetType() == CT_MEMBER + && !memcmp(chunk.Text(), "->", 2)) + { + chunk.SetType(CT_LAMBDA); + } + + // Don't create an entry for whitespace + if (chunk.GetType() == CT_WHITESPACE) + { + last_was_tab = chunk.GetAfterTab(); + prev_sp = chunk.GetOrigPrevSp(); + continue; + } + chunk.SetOrigPrevSp(prev_sp); + prev_sp = 0; + + if (chunk.GetType() == CT_NEWLINE) + { + last_was_tab = chunk.GetAfterTab(); + chunk.SetAfterTab(false); + chunk.Str().clear(); + } + else if (chunk.GetType() == CT_NL_CONT) + { + last_was_tab = chunk.GetAfterTab(); + chunk.SetAfterTab(false); + chunk.Str() = "\\\n"; + } + else + { + chunk.SetAfterTab(last_was_tab); + last_was_tab = false; + } + num_stripped = 0; // Issue #1966 and #3565 + + if (chunk.GetType() != CT_IGNORED) + { + // Issue #1338 + // Strip trailing whitespace (for CPP comments and PP blocks) + while ( (chunk.GetStr().size() > 0) + && ( (chunk.GetStr()[chunk.GetStr().size() - 1] == ' ') // 32 + || (chunk.GetStr()[chunk.GetStr().size() - 1] == '\t'))) + { + // If comment contains backslash '\' followed by whitespace chars, keep last one; + // this will prevent it from turning '\' into line continuation. + if ( (chunk.GetStr().size() > 1) + && (chunk.GetStr()[chunk.GetStr().size() - 2] == '\\')) + { + break; + } + chunk.Str().pop_back(); + num_stripped++; // Issue #1966 + } + } + // Store off the end column + chunk.SetOrigColEnd(ctx.c.col - num_stripped); // Issue #1966 and #3565 + + // Make the whitespace we disposed of be attributed to the next chunk + prev_sp = num_stripped; + + // Add the chunk to the list + rprev = pc; + + if (rprev->IsNotNullChunk()) + { + pc->SetFlagBits(rprev->GetFlags() & PCF_COPY_FLAGS); + + // a newline can't be in a preprocessor + if (pc->Is(CT_NEWLINE)) + { + pc->ResetFlagBits(PCF_IN_PREPROC); + } + } + + if (ref->IsNotNullChunk()) + { + chunk.SetFlagBits(PCF_INSERTED); + } + else + { + chunk.ResetFlagBits(PCF_INSERTED); + } + pc = chunk.CopyAndAddBefore(ref); + + // A newline marks the end of a preprocessor + if (pc->Is(CT_NEWLINE)) // || pc->Is(CT_COMMENT_MULTI)) + { + cpd.in_preproc = CT_NONE; + cpd.preproc_ncnl_count = 0; + } + + // Disable indentation when #asm directive found + if (pc->Is(CT_PP_ASM)) + { + LOG_FMT(LBCTRL, "Found a directive %s on line %zu\n", "#asm", pc->GetOrigLine()); + cpd.unc_off = true; + } + + // Special handling for preprocessor stuff + if (cpd.in_preproc != CT_NONE) + { + pc->SetFlagBits(PCF_IN_PREPROC); + // Issue #2225 + LOG_FMT(LBCTRL, "%s(%d): orig line is %zu, orig col is %zu, type is %s, parentType is %s\n", + __func__, __LINE__, pc->GetOrigLine(), pc->GetOrigCol(), + get_token_name(pc->GetType()), get_token_name(pc->GetParentType())); + + if ( pc->Is(CT_STRING_MULTI) + && pc->GetParentType() == CT_PP_INCLUDE) + { + LOG_FMT(LWARN, "%s:%zu: File name is not possible %s\n", + cpd.filename.c_str(), pc->GetOrigLine(), pc->Text()); + exit(EX_SOFTWARE); + } + + // Count words after the preprocessor + if (!pc->IsCommentOrNewline()) + { + cpd.preproc_ncnl_count++; + } + + // Disable indentation if a #pragma asm directive is found + if (cpd.in_preproc == CT_PP_PRAGMA) + { + if (memcmp(pc->Text(), "asm", 3) == 0) + { + LOG_FMT(LBCTRL, "Found a pragma %s on line %zu\n", "asm", pc->GetOrigLine()); + cpd.unc_off = true; + } + } + + // Figure out the type of preprocessor for #include parsing + if (cpd.in_preproc == CT_PREPROC) + { + if ( pc->GetType() < CT_PP_DEFINE + || pc->GetType() > CT_PP_OTHER) + { + pc->SetType(CT_PP_OTHER); + } + cpd.in_preproc = pc->GetType(); + } + else if (cpd.in_preproc == CT_PP_IGNORE) + { + if ( !pc->Is(CT_NL_CONT) + && !pc->IsComment()) // Issue #1966 + { + pc->SetType(CT_PP_IGNORE); + } + } + else if ( cpd.in_preproc == CT_PP_DEFINE + && pc->Is(CT_PAREN_CLOSE) + && options::pp_ignore_define_body()) + { + log_rule_B("pp_ignore_define_body"); + // When we have a PAREN_CLOSE in a PP_DEFINE we should be terminating a MACRO_FUNC + // arguments list. Therefore we can enter the PP_IGNORE state and ignore next chunks. + cpd.in_preproc = CT_PP_IGNORE; + } + } + else + { + // Check for a preprocessor start + if ( pc->Is(CT_POUND) + && ( rprev->IsNullChunk() + || rprev->Is(CT_NEWLINE))) + { + pc->SetType(CT_PREPROC); + pc->SetFlagBits(PCF_IN_PREPROC); + cpd.in_preproc = CT_PREPROC; + } + } + + if (pc->Is(CT_NEWLINE)) + { + LOG_FMT(LBCTRL, "%s(%d): orig line is %zu, orig col is %zu, <Newline>, nl is %zu\n", + __func__, __LINE__, pc->GetOrigLine(), pc->GetOrigCol(), pc->GetNlCount()); + } + else if (pc->Is(CT_VBRACE_OPEN)) + { + LOG_FMT(LBCTRL, "%s(%d): orig line is %zu, orig col is %zu, type is %s, orig col end is %zu\n", + __func__, __LINE__, pc->GetOrigLine(), pc->GetOrigCol(), get_token_name(pc->GetType()), pc->GetOrigColEnd()); + } + else + { + char copy[1000]; + LOG_FMT(LBCTRL, "%s(%d): orig line is %zu, orig col is %zu, Text() '%s', type is %s, orig col end is %zu\n", + __func__, __LINE__, pc->GetOrigLine(), pc->GetOrigCol(), pc->ElidedText(copy), get_token_name(pc->GetType()), pc->GetOrigColEnd()); + } + } + // Set the cpd.newline string for this file + log_rule_B("newlines"); + + if ( options::newlines() == LE_LF + || ( options::newlines() == LE_AUTO + && (LE_COUNT(LF) >= LE_COUNT(CRLF)) + && (LE_COUNT(LF) >= LE_COUNT(CR)))) + { + // LF line ends + cpd.newline = "\n"; + LOG_FMT(LLINEENDS, "Using LF line endings\n"); + } + else if ( options::newlines() == LE_CRLF + || ( options::newlines() == LE_AUTO + && (LE_COUNT(CRLF) >= LE_COUNT(LF)) + && (LE_COUNT(CRLF) >= LE_COUNT(CR)))) + { + // CRLF line ends + cpd.newline = "\r\n"; + LOG_FMT(LLINEENDS, "Using CRLF line endings\r\n"); + } + else + { + // CR line ends + cpd.newline = "\r"; + LOG_FMT(LLINEENDS, "Using CR line endings\n"); + } +} // tokenize |