#!/usr/bin/env python
#
# Copyright 2007 Neal Norwitz
# Portions Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenize C++ source code."""
__author__ = "nnorwitz@google.com (Neal Norwitz)"


try:
    # Python 3.x
    import builtins
except ImportError:
    # Python 2.x
    import __builtin__ as builtins

import sys

from cpp import utils

if not hasattr(builtins, "set"):
    # Nominal support for Python 2.3.
    from sets import Set as set


# Add $ as a valid identifier char since so much code uses it.
_letters = "abcdefghijklmnopqrstuvwxyz"
# Characters allowed in an identifier: letters, digits, '_', and '$'.
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + "_0123456789$")
# Digits allowed after a 0x/0X hexadecimal prefix.
HEX_DIGITS = set("0123456789abcdefABCDEF")
# Characters that may continue a decimal integer or float literal
# (digits plus exponent markers and signs).
INT_OR_FLOAT_DIGITS = set("01234567890eE-+")


# C++0x string prefixes.
_STR_PREFIXES = set(("R", "u8", "u8R", "u", "uR", "U", "UR", "L", "LR"))


# Token types.
UNKNOWN = "UNKNOWN"
SYNTAX = "SYNTAX"
CONSTANT = "CONSTANT"
NAME = "NAME"
PREPROCESSOR = "PREPROCESSOR"

# Where the token originated from.  This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)

class Token(object):
    """Data container to represent a C++ token.

    Tokens can be identifiers, syntax char(s), constants, or
    pre-processor directives.

    start contains the index of the first char of the token in the source
    end contains the index of the last char of the token in the source
    """

    def __init__(self, token_type, name, start, end):
        self.token_type = token_type
        self.name = name
        self.start = start
        self.end = end
        # Tokens produced by this module always originate from the stream.
        self.whence = WHENCE_STREAM

    def __str__(self):
        # Include the source span only when debugging is enabled.
        if utils.DEBUG:
            return "Token(%r, %s, %s)" % (self.name, self.start, self.end)
        return "Token(%r)" % self.name

    __repr__ = __str__


def _GetString(source, start, i):
|
|
i = source.find('"', i + 1)
|
|
while source[i - 1] == "\\":
|
|
# Count the trailing backslashes.
|
|
backslash_count = 1
|
|
j = i - 2
|
|
while source[j] == "\\":
|
|
backslash_count += 1
|
|
j -= 1
|
|
# When trailing backslashes are even, they escape each other.
|
|
if (backslash_count % 2) == 0:
|
|
break
|
|
i = source.find('"', i + 1)
|
|
return i + 1
|
|
|
|
|
|
def _GetChar(source, start, i):
    """Return the index just past the closing "'" of a char literal.

    source: full text being tokenized.
    start: index of the first char of the token.
    i: index at the opening quote; the scan begins at i + 1.
    """
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i + 1)
    while source[i - 1] == "\\":
        # Need to special case '\\': the quote before it is a real
        # terminator because the two backslashes escape each other.
        if (i - 2) > start and source[i - 2] == "\\":
            break
        i = source.find("'", i + 1)
    # Try to handle unterminated single quotes (in a #if 0 block).
    # When find() returned -1 above, fall back to consuming one char.
    if i < 0:
        i = start
    return i + 1


def GetTokens(source):
    """Returns a sequence of Tokens.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.

    Raises:
      RuntimeError: when an unexpected character is found outside of a
        #if 0 block.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set(".")

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == "_":  # Find a string token.
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            if source[i] == "'" and (i - start) == 1 and source[start:i] in "uUL":
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == '"' and source[start:i] in _STR_PREFIXES:
                # BUG FIX: a string prefix (L, u8, R, ...) is followed by a
                # double quote, not a single quote; _GetString scans for '"'.
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == "/" and source[i + 1] == "/":  # Find // comments.
            i = source.find("\n", i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == "/" and source[i + 1] == "*":  # Find /* comments. */
            i = source.find("*/", i) + 2
            continue
        elif c in ":+-<>&|*=":  # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            if new_ch == c and c != ">":  # Treat ">>" as two tokens.
                i += 1
            elif c == "-" and new_ch == ">":
                i += 1
            elif new_ch == "=":
                i += 1
        elif c in "()[]{}~!?^%;/.,":  # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == "." and source[i].isdigit():
                # A float like ".5" starts with the dot.
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ("l", "f"):
                    if suffix == source[i : i + 1].lower():
                        i += 1
                        break
        elif c.isdigit():  # Find integer.
            token_type = CONSTANT
            if c == "0" and source[i + 1] in "xX":
                # Handle hex digits.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ("ull", "ll", "ul", "l", "f", "u"):
                size = len(suffix)
                if suffix == source[i : i + size].lower():
                    i += size
                    break
        elif c == '"':  # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":  # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == "#":  # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i : i + 3] == "#if" and source[i + 3 : i + 4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i : i + 6] == "#endif":
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find("\n", i)
                i2 = source.find("//", i)
                i3 = source.find("/*", i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if source[i] == '"':
                    i = source.find('"', i + 1) + 1
                    assert i > 0
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i - 1] == "\\"):
                    if got_if:
                        # Mark "#if 0" / "#if (0)" blocks so bogus code
                        # inside them does not raise below.
                        condition = source[start + 4 : i].lstrip()
                        if condition.startswith("0") or condition.startswith("(0)"):
                            ignore_errors = True
                    break
                i += 1
        elif c == "\\":  # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape.  This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled.  Since we will ignore
            # it anyways, this is probably fine.  So disable the
            # exception and return the bogus char.
            i += 1
        else:
            sys.stderr.write(
                "Got invalid token in %s @ %d token:%s: %r\n"
                % ("?", i, c, source[i - 10 : i + 10])
            )
            raise RuntimeError("unexpected token")

        if i <= 0:
            print("Invalid index, exiting now.")
            return
        yield Token(token_type, source[start:i], start, i)


if __name__ == "__main__":

    def main(argv):
        """Driver mostly for testing purposes."""
        for filename in argv[1:]:
            contents = utils.ReadFile(filename)
            if contents is None:
                # Unreadable file; skip it.
                continue
            # Dump each token as "TYPE        : name".
            for token in GetTokens(contents):
                print("%-12s: %s" % (token.token_type, token.name))
            sys.stdout.write("\n")

    main(sys.argv)