summaryrefslogtreecommitdiffstats
path: root/debian/uncrustify-trinity/uncrustify-trinity-0.75.0/scripts/tokenizer.py
diff options
context:
space:
mode:
Diffstat (limited to 'debian/uncrustify-trinity/uncrustify-trinity-0.75.0/scripts/tokenizer.py')
-rwxr-xr-xdebian/uncrustify-trinity/uncrustify-trinity-0.75.0/scripts/tokenizer.py316
1 files changed, 316 insertions, 0 deletions
diff --git a/debian/uncrustify-trinity/uncrustify-trinity-0.75.0/scripts/tokenizer.py b/debian/uncrustify-trinity/uncrustify-trinity-0.75.0/scripts/tokenizer.py
new file mode 100755
index 00000000..0bc33bac
--- /dev/null
+++ b/debian/uncrustify-trinity/uncrustify-trinity-0.75.0/scripts/tokenizer.py
@@ -0,0 +1,316 @@
+#! /usr/bin/env python
+# tokenize.py
+#
+# Parses a C/C++/C#/D/Java/Pawn/whatever file in an array of
+# tuples (string, type)
+#
+
+# punctuator lookup table
+punc_table = [
+ [ '!', 25, 26, '!' ], # 0: '!'
+ [ '#', 24, 35, '#' ], # 1: '#'
+ [ '$', 23, 0, '$' ], # 2: '$'
+ [ '%', 22, 36, '%' ], # 3: '%'
+ [ '&', 21, 41, '&' ], # 4: '&'
+ [ '(', 20, 0, '(' ], # 5: '('
+ [ ')', 19, 0, ')' ], # 6: ')'
+ [ '*', 18, 43, '*' ], # 7: '*'
+ [ '+', 17, 44, '+' ], # 8: '+'
+ [ ',', 16, 0, ',' ], # 9: ','
+ [ '-', 15, 46, '-' ], # 10: '-'
+ [ '.', 14, 50, '.' ], # 11: '.'
+ [ '/', 13, 53, '/' ], # 12: '/'
+ [ ':', 12, 54, ':' ], # 13: ':'
+ [ ';', 11, 0, ';' ], # 14: ';'
+ [ '<', 10, 56, '<' ], # 15: '<'
+ [ '=', 9, 63, '=' ], # 16: '='
+ [ '>', 8, 65, '>' ], # 17: '>'
+ [ '?', 7, 0, '?' ], # 18: '?'
+ [ '[', 6, 70, '[' ], # 19: '['
+ [ ']', 5, 0, ']' ], # 20: ']'
+ [ '^', 4, 71, '^' ], # 21: '^'
+ [ '{', 3, 0, '{' ], # 22: '{'
+ [ '|', 2, 72, '|' ], # 23: '|'
+ [ '}', 1, 0, '}' ], # 24: '}'
+ [ '~', 0, 74, '~' ], # 25: '~'
+ [ '<', 3, 30, '!<' ], # 26: '!<'
+ [ '=', 2, 33, '!=' ], # 27: '!='
+ [ '>', 1, 34, '!>' ], # 28: '!>'
+ [ '~', 0, 0, '!~' ], # 29: '!~'
+ [ '=', 1, 0, '!<=' ], # 30: '!<='
+ [ '>', 0, 32, '!<>' ], # 31: '!<>'
+ [ '=', 0, 0, '!<>='], # 32: '!<>='
+ [ '=', 0, 0, '!==' ], # 33: '!=='
+ [ '=', 0, 0, '!>=' ], # 34: '!>='
+ [ '#', 0, 0, '##' ], # 35: '##'
+ [ ':', 2, 39, '%:' ], # 36: '%:'
+ [ '=', 1, 0, '%=' ], # 37: '%='
+ [ '>', 0, 0, '%>' ], # 38: '%>'
+ [ '%', 0, 40, None ], # 39: '%:%'
+ [ ':', 0, 0, '%:%:'], # 40: '%:%:'
+ [ '&', 1, 0, '&&' ], # 41: '&&'
+ [ '=', 0, 0, '&=' ], # 42: '&='
+ [ '=', 0, 0, '*=' ], # 43: '*='
+ [ '+', 1, 0, '++' ], # 44: '++'
+ [ '=', 0, 0, '+=' ], # 45: '+='
+ [ '-', 2, 0, '--' ], # 46: '--'
+ [ '=', 1, 0, '-=' ], # 47: '-='
+ [ '>', 0, 49, '->' ], # 48: '->'
+ [ '*', 0, 0, '->*' ], # 49: '->*'
+ [ '*', 1, 0, '.*' ], # 50: '.*'
+ [ '.', 0, 52, '..' ], # 51: '..'
+ [ '.', 0, 0, '...' ], # 52: '...'
+ [ '=', 0, 0, '/=' ], # 53: '/='
+ [ ':', 1, 0, '::' ], # 54: '::'
+ [ '>', 0, 0, ':>' ], # 55: ':>'
+ [ '%', 4, 0, '<%' ], # 56: '<%'
+ [ ':', 3, 0, '<:' ], # 57: '<:'
+ [ '<', 2, 61, '<<' ], # 58: '<<'
+ [ '=', 1, 0, '<=' ], # 59: '<='
+ [ '>', 0, 62, '<>' ], # 60: '<>'
+ [ '=', 0, 0, '<<=' ], # 61: '<<='
+ [ '=', 0, 0, '<>=' ], # 62: '<>='
+ [ '=', 0, 64, '==' ], # 63: '=='
+ [ '=', 0, 0, '===' ], # 64: '==='
+ [ '=', 1, 0, '>=' ], # 65: '>='
+ [ '>', 0, 67, '>>' ], # 66: '>>'
+ [ '=', 1, 0, '>>=' ], # 67: '>>='
+ [ '>', 0, 69, '>>>' ], # 68: '>>>'
+ [ '=', 0, 0, '>>>='], # 69: '>>>='
+ [ ']', 0, 0, '[]' ], # 70: '[]'
+ [ '=', 0, 0, '^=' ], # 71: '^='
+ [ '=', 1, 0, '|=' ], # 72: '|='
+ [ '|', 0, 0, '||' ], # 73: '||'
+ [ '=', 1, 0, '~=' ], # 74: '~='
+ [ '~', 0, 0, '~~' ], # 75: '~~'
+]
+
+
+#
+# Token types:
+# 0 = newline
+# 1 = punctuator
+# 2 = integer
+# 3 = float
+# 4 = string
+# 5 = identifier
+#
+class Tokenizer:
+ def __init__(self):
+ self.tokens = []
+ self.text = ''
+ self.text_idx = 0
+
+ def tokenize_text(self, in_text):
+ self.tokens = []
+ self.text = in_text
+ self.text_idx = 0
+
+ print(in_text)
+ try:
+ while self.text_idx < len(self.text):
+ if self.parse_whitespace():
+ continue
+ elif self.text[self.text_idx] == '\\' and self.text[self.text_idx + 1] == '\n':
+ self.text_idx += 2
+ continue
+ elif self.parse_comment():
+ continue
+ elif self.parse_number():
+ continue
+ elif self.parse_identifier():
+ continue
+ elif self.parse_string():
+ continue
+ elif self.parse_punctuator():
+ continue
+ else:
+ print("confused: %s" % self.text[self.text_idx:])
+ break
+ except:
+ print("bombed")
+ raise
+
+ def parse_whitespace(self):
+ start_idx = self.text_idx
+ hit_newline = False
+ while self.text_idx < len(self.text):
+ if self.text[self.text_idx] in '\n\r':
+ hit_newline = True
+ elif not self.text[self.text_idx] in ' \t':
+ break
+ self.text_idx += 1
+
+ if hit_newline:
+ self.tokens.append(('\n', 0))
+ return start_idx != self.text_idx
+
+ def parse_comment(self):
+ if not self.text[self.text_idx] == '/' or not self.text[self.text_idx + 1] in '/*':
+ return False
+ if self.text[self.text_idx + 1] == '/':
+ while self.text_idx < len(self.text):
+ if self.text[self.text_idx] in '\n\r':
+ break
+ self.text_idx += 1
+ else:
+ while self.text_idx < len(self.text) - 1:
+ if self.text[self.text_idx] == '*' and self.text[self.text_idx + 1] == '/':
+ self.text_idx += 2
+ break
+ self.text_idx += 1
+ return True
+
+ def parse_identifier(self):
+ if not self.text[self.text_idx].upper() in '@_ABCDEFGHIJKLMNOPQRSTUVWXYZ':
+ return False
+ start_idx = self.text_idx
+ while self.text_idx < len(self.text) and \
+ self.text[self.text_idx].upper() in '@_ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890':
+ self.text_idx += 1
+ self.tokens.append((self.text[start_idx : self.text_idx], 5))
+ return True
+
+ def parse_string(self):
+ starter = 0
+ start_ch = self.text[self.text_idx]
+ if start_ch == 'L':
+ starter = 1
+ start_ch = self.text[self.text_idx + 1]
+ if not start_ch in '"\'':
+ return False
+ start_idx = self.text_idx
+ self.text_idx += starter + 1
+ escaped = False
+ while self.text_idx < len(self.text):
+ if escaped:
+ escaped = False
+ else:
+ if self.text[self.text_idx] == '\\':
+ escaped = True
+ elif self.text[self.text_idx] == start_ch:
+ self.text_idx += 1
+ break
+ self.text_idx += 1
+
+ self.tokens.append((self.text[start_idx : self.text_idx], 4))
+ return True
+
+ # Checks for punctuators
+ # Returns whether a punctuator was consumed (True or False)
+ def parse_punctuator(self):
+ tab_idx = 0
+ punc_len = 0
+ saved_punc = None
+ while 1:
+ pte = punc_table[tab_idx]
+ if pte[0] == self.text[self.text_idx]:
+ if pte[3] is not None:
+ saved_punc = pte[3]
+ self.text_idx += 1
+ tab_idx = pte[2]
+ if tab_idx == 0:
+ break
+ elif pte[1] == 0:
+ break
+ else:
+ tab_idx += 1
+ if saved_punc is not None:
+ self.tokens.append((saved_punc, 1))
+ return True
+ return False
+
+ def parse_number(self):
+ # A number must start with a digit or a dot followed by a digit
+ ch = self.text[self.text_idx]
+ if not ch.isdigit() and (ch != '.' or not self.text[self.text_idx + 1].isdigit()):
+ return False
+ token_type = 2 # integer
+ if ch == '.':
+ token_type = 3 # float
+ did_hex = False
+ start_idx = self.text_idx
+
+ # Check for Hex, Octal, or Binary
+ # Note that only D and Pawn support binary, but who cares?
+ #
+ if ch == '0':
+ self.text_idx += 1
+ ch = self.text[self.text_idx].upper()
+ if ch == 'X': # hex
+ did_hex = True
+ self.text_idx += 1
+ while self.text[self.text_idx] in '_0123456789abcdefABCDEF':
+ self.text_idx += 1
+ elif ch == 'B': # binary
+ self.text_idx += 1
+ while self.text[self.text_idx] in '_01':
+ self.text_idx += 1
+ elif ch >= '0' and ch <= 7: # octal (but allow decimal)
+ self.text_idx += 1
+ while self.text[self.text_idx] in '_0123456789':
+ self.text_idx += 1
+ else:
+ # either just 0 or 0.1 or 0UL, etc
+ pass
+ else:
+ # Regular int or float
+ while self.text[self.text_idx] in '_0123456789':
+ self.text_idx += 1
+
+ # Check if we stopped on a decimal point
+ if self.text[self.text_idx] == '.':
+ self.text_idx += 1
+ token_type = 3 # float
+ if did_hex:
+ while self.text[self.text_idx] in '_0123456789abcdefABCDEF':
+ self.text_idx += 1
+ else:
+ while self.text[self.text_idx] in '_0123456789':
+ self.text_idx += 1
+
+ # Check exponent
+ # Valid exponents per language (not that it matters):
+ # C/C++/D/Java: eEpP
+ # C#/Pawn: eE
+ if self.text[self.text_idx] in 'eEpP':
+ token_type = 3 # float
+ self.text_idx += 1
+ if self.text[self.text_idx] in '+-':
+ self.text_idx += 1
+ while self.text[self.text_idx] in '_0123456789':
+ self.text_idx += 1
+
+ # Check the suffixes
+ # Valid suffixes per language (not that it matters):
+ # Integer Float
+ # C/C++: uUlL lLfF
+ # C#: uUlL fFdDMm
+ # D: uUL ifFL
+ # Java: lL fFdD
+ # Pawn: (none) (none)
+ #
+ # Note that i, f, d, and m only appear in floats.
+ while 1:
+ if self.text[self.text_idx] in 'tTfFdDmM':
+ token_type = 3 # float
+ elif not self.text[self.text_idx] in 'lLuU':
+ break
+ self.text_idx += 1
+
+ self.tokens.append((self.text[start_idx : self.text_idx], token_type))
+ return True
+
+text = """
+1.23+4-3*16%2 *sin(1.e-3 + .5p32) "hello" and "hello\\"there"
+123 // some comment
+a = b + c;
+#define abc \\
+ 5
+d = 5 /* hello */ + 3;
+"""
+
+t = Tokenizer()
+t.tokenize_text(text)
+print(t.tokens)
+