diff options
Diffstat (limited to 'debian/uncrustify-trinity/uncrustify-trinity-0.75.0/scripts/tokenizer.py')
-rwxr-xr-x | debian/uncrustify-trinity/uncrustify-trinity-0.75.0/scripts/tokenizer.py | 316 |
1 files changed, 316 insertions, 0 deletions
#! /usr/bin/env python
# tokenizer.py
#
# Parses a C/C++/C#/D/Java/Pawn/whatever file in an array of
# tuples (string, type)
#

# punctuator lookup table
#
# The table is a flattened trie.  Each entry is
#   [ ch, siblings_left, child_idx, token ]
# where
#   ch            - the character this entry matches
#   siblings_left - how many alternative entries follow this one at the
#                   same depth (0 = this is the last alternative)
#   child_idx     - index of the first entry to try for the NEXT character
#                   once 'ch' has matched (0 = no longer punctuator exists)
#   token         - the punctuator text matched so far, or None when the
#                   prefix is not a punctuator by itself (e.g. '%:%')
punc_table = [
    [ '!', 25, 26, '!'   ],   #  0: '!'
    [ '#', 24, 35, '#'   ],   #  1: '#'
    [ '$', 23,  0, '$'   ],   #  2: '$'
    [ '%', 22, 36, '%'   ],   #  3: '%'
    [ '&', 21, 41, '&'   ],   #  4: '&'
    [ '(', 20,  0, '('   ],   #  5: '('
    [ ')', 19,  0, ')'   ],   #  6: ')'
    [ '*', 18, 43, '*'   ],   #  7: '*'
    [ '+', 17, 44, '+'   ],   #  8: '+'
    [ ',', 16,  0, ','   ],   #  9: ','
    [ '-', 15, 46, '-'   ],   # 10: '-'
    [ '.', 14, 50, '.'   ],   # 11: '.'
    [ '/', 13, 53, '/'   ],   # 12: '/'
    [ ':', 12, 54, ':'   ],   # 13: ':'
    [ ';', 11,  0, ';'   ],   # 14: ';'
    [ '<', 10, 56, '<'   ],   # 15: '<'
    [ '=',  9, 63, '='   ],   # 16: '='
    [ '>',  8, 65, '>'   ],   # 17: '>'
    [ '?',  7,  0, '?'   ],   # 18: '?'
    [ '[',  6, 70, '['   ],   # 19: '['
    [ ']',  5,  0, ']'   ],   # 20: ']'
    [ '^',  4, 71, '^'   ],   # 21: '^'
    [ '{',  3,  0, '{'   ],   # 22: '{'
    [ '|',  2, 72, '|'   ],   # 23: '|'
    [ '}',  1,  0, '}'   ],   # 24: '}'
    [ '~',  0, 74, '~'   ],   # 25: '~'
    [ '<',  3, 30, '!<'  ],   # 26: '!<'
    [ '=',  2, 33, '!='  ],   # 27: '!='
    [ '>',  1, 34, '!>'  ],   # 28: '!>'
    [ '~',  0,  0, '!~'  ],   # 29: '!~'
    [ '=',  1,  0, '!<=' ],   # 30: '!<='
    [ '>',  0, 32, '!<>' ],   # 31: '!<>'
    [ '=',  0,  0, '!<>='],   # 32: '!<>='
    [ '=',  0,  0, '!==' ],   # 33: '!=='
    [ '=',  0,  0, '!>=' ],   # 34: '!>='
    [ '#',  0,  0, '##'  ],   # 35: '##'
    [ ':',  2, 39, '%:'  ],   # 36: '%:'
    [ '=',  1,  0, '%='  ],   # 37: '%='
    [ '>',  0,  0, '%>'  ],   # 38: '%>'
    [ '%',  0, 40, None  ],   # 39: '%:%'
    [ ':',  0,  0, '%:%:'],   # 40: '%:%:'
    [ '&',  1,  0, '&&'  ],   # 41: '&&'
    [ '=',  0,  0, '&='  ],   # 42: '&='
    [ '=',  0,  0, '*='  ],   # 43: '*='
    [ '+',  1,  0, '++'  ],   # 44: '++'
    [ '=',  0,  0, '+='  ],   # 45: '+='
    [ '-',  2,  0, '--'  ],   # 46: '--'
    [ '=',  1,  0, '-='  ],   # 47: '-='
    [ '>',  0, 49, '->'  ],   # 48: '->'
    [ '*',  0,  0, '->*' ],   # 49: '->*'
    [ '*',  1,  0, '.*'  ],   # 50: '.*'
    [ '.',  0, 52, '..'  ],   # 51: '..'
    [ '.',  0,  0, '...' ],   # 52: '...'
    [ '=',  0,  0, '/='  ],   # 53: '/='
    [ ':',  1,  0, '::'  ],   # 54: '::'
    [ '>',  0,  0, ':>'  ],   # 55: ':>'
    [ '%',  4,  0, '<%'  ],   # 56: '<%'
    [ ':',  3,  0, '<:'  ],   # 57: '<:'
    [ '<',  2, 61, '<<'  ],   # 58: '<<'
    [ '=',  1,  0, '<='  ],   # 59: '<='
    [ '>',  0, 62, '<>'  ],   # 60: '<>'
    [ '=',  0,  0, '<<=' ],   # 61: '<<='
    [ '=',  0,  0, '<>=' ],   # 62: '<>='
    [ '=',  0, 64, '=='  ],   # 63: '=='
    [ '=',  0,  0, '===' ],   # 64: '==='
    [ '=',  1,  0, '>='  ],   # 65: '>='
    [ '>',  0, 67, '>>'  ],   # 66: '>>'
    [ '=',  1,  0, '>>=' ],   # 67: '>>='
    [ '>',  0, 69, '>>>' ],   # 68: '>>>'
    [ '=',  0,  0, '>>>='],   # 69: '>>>='
    [ ']',  0,  0, '[]'  ],   # 70: '[]'
    [ '=',  0,  0, '^='  ],   # 71: '^='
    [ '=',  1,  0, '|='  ],   # 72: '|='
    [ '|',  0,  0, '||'  ],   # 73: '||'
    [ '=',  1,  0, '~='  ],   # 74: '~='
    [ '~',  0,  0, '~~'  ],   # 75: '~~'
]


#
# Token types:
#  0 = newline
#  1 = punctuator
#  2 = integer
#  3 = float
#  4 = string
#  5 = identifier
#
TOKEN_NEWLINE = 0
TOKEN_PUNCTUATOR = 1
TOKEN_INTEGER = 2
TOKEN_FLOAT = 3
TOKEN_STRING = 4
TOKEN_IDENTIFIER = 5

# Character classes used by the scanners.  '_' is accepted inside numbers
# as a digit separator.
_DEC_CHARS = '_0123456789'
_HEX_CHARS = '_0123456789abcdefABCDEF'
_BIN_CHARS = '_01'
_IDENT_START = '@_ABCDEFGHIJKLMNOPQRSTUVWXYZ'
_IDENT_BODY = _IDENT_START + '1234567890'


class Tokenizer:
    """Split source text into a list of (string, token_type) tuples.

    After tokenize_text() the result is available in self.tokens.  Every
    parse_*() method inspects the text at self.text_idx, consumes what it
    recognises, and returns True if it consumed anything.
    """

    def __init__(self):
        self.tokens = []
        self.text = ''
        self.text_idx = 0

    def _peek(self, offset=0):
        """Return the char 'offset' places past the cursor, or '' past the end."""
        idx = self.text_idx + offset
        if idx < len(self.text):
            return self.text[idx]
        return ''

    def _skip_chars(self, chars):
        """Advance the cursor over every consecutive character found in 'chars'."""
        end = len(self.text)
        while self.text_idx < end and self.text[self.text_idx] in chars:
            self.text_idx += 1

    @staticmethod
    def _is_digit(ch):
        """True only for the ASCII digits 0-9 (False for '' and Unicode digits)."""
        return '0' <= ch <= '9'

    def tokenize_text(self, in_text):
        """Tokenize in_text, replacing the contents of self.tokens.

        The input is echoed to stdout first.  Scanning stops with a
        "confused: ..." message at the first character no parser accepts;
        tokens collected up to that point are kept.
        """
        self.tokens = []
        self.text = in_text
        self.text_idx = 0

        print(in_text)
        try:
            while self.text_idx < len(self.text):
                if self.parse_whitespace():
                    continue
                elif self._peek() == '\\' and self._peek(1) == '\n':
                    # backslash-newline is a line continuation: no token
                    self.text_idx += 2
                    continue
                elif self.parse_comment():
                    continue
                elif self.parse_number():
                    continue
                elif self.parse_string():
                    # must run before parse_identifier, otherwise the 'L'
                    # of a wide literal (L"...") is taken as an identifier
                    continue
                elif self.parse_identifier():
                    continue
                elif self.parse_punctuator():
                    continue
                else:
                    print("confused: %s" % self.text[self.text_idx:])
                    break
        except Exception:
            print("bombed")
            raise

    def parse_whitespace(self):
        """Skip blanks, tabs, CR and LF; emit ONE newline token if any CR/LF was seen."""
        start_idx = self.text_idx
        hit_newline = False
        while self.text_idx < len(self.text):
            ch = self.text[self.text_idx]
            if ch in '\n\r':
                hit_newline = True
            elif ch not in ' \t':
                break
            self.text_idx += 1

        if hit_newline:
            self.tokens.append(('\n', TOKEN_NEWLINE))
        return start_idx != self.text_idx

    def parse_comment(self):
        """Skip a '//' or '/* */' comment.  Comments produce no token.

        A line comment stops in front of the newline.  An unterminated block
        comment swallows the rest of the text.
        """
        if self._peek() != '/':
            return False
        kind = self._peek(1)
        if kind == '/':
            end = len(self.text)
            while self.text_idx < end and self.text[self.text_idx] not in '\n\r':
                self.text_idx += 1
            return True
        if kind == '*':
            # Search after the opener so that '/*/' is not taken as closed.
            close_idx = self.text.find('*/', self.text_idx + 2)
            if close_idx < 0:
                self.text_idx = len(self.text)
            else:
                self.text_idx = close_idx + 2
            return True
        return False

    def parse_identifier(self):
        """Consume an identifier: [@_A-Za-z] followed by [@_A-Za-z0-9]*."""
        first = self._peek()
        if not first or first.upper() not in _IDENT_START:
            return False
        start_idx = self.text_idx
        end = len(self.text)
        while self.text_idx < end and \
              self.text[self.text_idx].upper() in _IDENT_BODY:
            self.text_idx += 1
        self.tokens.append((self.text[start_idx : self.text_idx], TOKEN_IDENTIFIER))
        return True

    def parse_string(self):
        """Consume a '...' or "..." literal, optionally prefixed with L.

        Backslash escapes the following character.  An unterminated literal
        runs to the end of the text.
        """
        starter = 0
        start_ch = self._peek()
        if start_ch == 'L':
            starter = 1
            start_ch = self._peek(1)
        # tuple membership: '' (end of text) must not count as a quote
        if start_ch not in ('"', "'"):
            return False
        start_idx = self.text_idx
        self.text_idx += starter + 1
        escaped = False
        while self.text_idx < len(self.text):
            ch = self.text[self.text_idx]
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == start_ch:
                self.text_idx += 1
                break
            self.text_idx += 1

        self.tokens.append((self.text[start_idx : self.text_idx], TOKEN_STRING))
        return True

    # Checks for punctuators
    # Returns whether a punctuator was consumed (True or False)
    def parse_punctuator(self):
        """Consume the longest punctuator from punc_table at the cursor."""
        tab_idx = 0
        idx = self.text_idx
        end = len(self.text)
        saved_punc = None
        saved_idx = idx
        while idx < end:
            pte = punc_table[tab_idx]
            if pte[0] == self.text[idx]:
                idx += 1
                if pte[3] is not None:
                    # remember the longest complete punctuator seen so far
                    saved_punc = pte[3]
                    saved_idx = idx
                tab_idx = pte[2]
                if tab_idx == 0:
                    break
            elif pte[1] == 0:
                break
            else:
                tab_idx += 1
        if saved_punc is None:
            return False
        # Only consume up to the last complete punctuator; anything matched
        # beyond it through a token-less entry is scanned again.
        self.text_idx = saved_idx
        self.tokens.append((saved_punc, TOKEN_PUNCTUATOR))
        return True

    def parse_number(self):
        """Consume an integer or float literal, including prefix and suffixes."""
        # A number must start with a digit or a dot followed by a digit
        ch = self._peek()
        if not self._is_digit(ch) and \
           (ch != '.' or not self._is_digit(self._peek(1))):
            return False
        token_type = TOKEN_INTEGER
        if ch == '.':
            token_type = TOKEN_FLOAT
        did_hex = False
        start_idx = self.text_idx

        # Check for Hex, Octal, or Binary
        # Note that only D and Pawn support binary, but who cares?
        #
        if ch == '0':
            self.text_idx += 1
            ch = self._peek().upper()
            if ch == 'X':                      # hex
                did_hex = True
                self.text_idx += 1
                self._skip_chars(_HEX_CHARS)
            elif ch == 'B':                    # binary
                self.text_idx += 1
                self._skip_chars(_BIN_CHARS)
            elif self._is_digit(ch):           # octal (but allow decimal)
                self.text_idx += 1
                self._skip_chars(_DEC_CHARS)
            else:
                # either just 0 or 0.1 or 0UL, etc
                pass
        else:
            # Regular int or float
            self._skip_chars(_DEC_CHARS)

        # Check if we stopped on a decimal point
        if self._peek() == '.':
            self.text_idx += 1
            token_type = TOKEN_FLOAT
            if did_hex:
                self._skip_chars(_HEX_CHARS)
            else:
                self._skip_chars(_DEC_CHARS)

        # Check exponent
        # Valid exponents per language (not that it matters):
        # C/C++/D/Java: eEpP
        # C#/Pawn:      eE
        exp_ch = self._peek()
        if exp_ch and exp_ch in 'eEpP':
            token_type = TOKEN_FLOAT
            self.text_idx += 1
            sign_ch = self._peek()
            if sign_ch and sign_ch in '+-':
                self.text_idx += 1
            self._skip_chars(_DEC_CHARS)

        # Check the suffixes
        # Valid suffixes per language (not that it matters):
        #        Integer       Float
        # C/C++: uUlL          lLfF
        # C#:    uUlL          fFdDMm
        # D:     uUL           ifFL
        # Java:  lL            fFdD
        # Pawn:  (none)        (none)
        #
        # Note that i, f, d, and m only appear in floats.
        # NOTE(review): 't'/'T' is not in the list above; it is kept only
        # because earlier versions of this script accepted it.
        end = len(self.text)
        while self.text_idx < end:
            suffix = self.text[self.text_idx]
            if suffix in 'iItTfFdDmM':
                token_type = TOKEN_FLOAT
            elif suffix not in 'lLuU':
                break
            self.text_idx += 1

        self.tokens.append((self.text[start_idx : self.text_idx], token_type))
        return True


text = """
1.23+4-3*16%2 *sin(1.e-3 + .5p32) "hello" and "hello\\"there"
123 // some comment
a = b + c;
#define abc \\
        5
d = 5 /* hello */ + 3;
"""

# Self-test: only runs when the file is executed as a script.
if __name__ == '__main__':
    t = Tokenizer()
    t.tokenize_text(text)
    print(t.tokens)