in ext/gtest-1.8.0/googlemock/scripts/generator/cpp/tokenize.py [0:0]
def GetTokens(source):
"""Returns a sequence of Tokens.
Args:
source: string of C++ source code.
Yields:
Token that represents the next token in the source.
"""
# Cache various valid character sets for speed.
valid_identifier_chars = VALID_IDENTIFIER_CHARS
hex_digits = HEX_DIGITS
int_or_float_digits = INT_OR_FLOAT_DIGITS
int_or_float_digits2 = int_or_float_digits | set('.')
# Only ignore errors while in a #if 0 block.
ignore_errors = False
count_ifs = 0
i = 0
end = len(source)
while i < end:
# Skip whitespace.
while i < end and source[i].isspace():
i += 1
if i >= end:
return
token_type = UNKNOWN
start = i
c = source[i]
if c.isalpha() or c == '_': # Find a string token.
token_type = NAME
while source[i] in valid_identifier_chars:
i += 1
# String and character constants can look like a name if
# they are something like L"".
if (source[i] == "'" and (i - start) == 1 and
source[start:i] in 'uUL'):
# u, U, and L are valid C++0x character preffixes.
token_type = CONSTANT
i = _GetChar(source, start, i)
elif source[i] == "'" and source[start:i] in _STR_PREFIXES:
token_type = CONSTANT
i = _GetString(source, start, i)
elif c == '/' and source[i+1] == '/': # Find // comments.
i = source.find('\n', i)
if i == -1: # Handle EOF.
i = end
continue
elif c == '/' and source[i+1] == '*': # Find /* comments. */
i = source.find('*/', i) + 2
continue
elif c in ':+-<>&|*=': # : or :: (plus other chars).
token_type = SYNTAX
i += 1
new_ch = source[i]
if new_ch == c and c != '>': # Treat ">>" as two tokens.
i += 1
elif c == '-' and new_ch == '>':
i += 1
elif new_ch == '=':
i += 1
elif c in '()[]{}~!?^%;/.,': # Handle single char tokens.
token_type = SYNTAX
i += 1
if c == '.' and source[i].isdigit():
token_type = CONSTANT
i += 1
while source[i] in int_or_float_digits:
i += 1
# Handle float suffixes.
for suffix in ('l', 'f'):
if suffix == source[i:i+1].lower():
i += 1
break
elif c.isdigit(): # Find integer.
token_type = CONSTANT
if c == '0' and source[i+1] in 'xX':
# Handle hex digits.
i += 2
while source[i] in hex_digits:
i += 1
else:
while source[i] in int_or_float_digits2:
i += 1
# Handle integer (and float) suffixes.
for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
size = len(suffix)
if suffix == source[i:i+size].lower():
i += size
break
elif c == '"': # Find string.
token_type = CONSTANT
i = _GetString(source, start, i)
elif c == "'": # Find char.
token_type = CONSTANT
i = _GetChar(source, start, i)
elif c == '#': # Find pre-processor command.
token_type = PREPROCESSOR
got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
if got_if:
count_ifs += 1
elif source[i:i+6] == '#endif':
count_ifs -= 1
if count_ifs == 0:
ignore_errors = False
# TODO(nnorwitz): handle preprocessor statements (\ continuations).
while 1:
i1 = source.find('\n', i)
i2 = source.find('//', i)
i3 = source.find('/*', i)
i4 = source.find('"', i)
# NOTE(nnorwitz): doesn't handle comments in #define macros.
# Get the first important symbol (newline, comment, EOF/end).
i = min([x for x in (i1, i2, i3, i4, end) if x != -1])
# Handle #include "dir//foo.h" properly.
if source[i] == '"':
i = source.find('"', i+1) + 1
assert i > 0
continue
# Keep going if end of the line and the line ends with \.
if not (i == i1 and source[i-1] == '\\'):
if got_if:
condition = source[start+4:i].lstrip()
if (condition.startswith('0') or
condition.startswith('(0)')):
ignore_errors = True
break
i += 1
elif c == '\\': # Handle \ in code.
# This is different from the pre-processor \ handling.
i += 1
continue
elif ignore_errors:
# The tokenizer seems to be in pretty good shape. This
# raise is conditionally disabled so that bogus code
# in an #if 0 block can be handled. Since we will ignore
# it anyways, this is probably fine. So disable the
# exception and return the bogus char.
i += 1
else:
sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
('?', i, c, source[i-10:i+10]))
raise RuntimeError('unexpected token')
if i <= 0:
print('Invalid index, exiting now.')
return
yield Token(token_type, source[start:i], start, i)