删除脚本：利用状态机逐字符扫描，删除 Python 代码中的注释和字符串字面量。
from enum import Enum
# Scanner states for walking source text one character at a time.
class State(Enum):
    """States of the comment/string-stripping state machine."""
    CODE = 0                      # ordinary code
    SLASH = 1                     # a '/' was seen (C-style sources)
    NOTE_MULTILINE = 2            # inside a multi-line comment
    NOTE_SINGLELINE = 3           # inside a single-line comment
    BACKSLASH = 4                 # line-continuation backslash
    CODE_CHAR = 5                 # inside a single-quoted literal
    CHAR_ESCAPE_SEQUENCE = 6      # escape sequence inside a single-quoted literal
    CODE_STRING = 7               # inside a double-quoted literal
    STRING_ESCAPE_SEQUENCE = 8    # escape sequence inside a double-quoted literal
    PYTHON_NOTE_SINGLELINE = 9    # Python '#' comment
    PYTHON_NOTE_MULTILINE_1 = 10  # Python '''...''' block
    PYTHON_NOTE_MULTILINE_2 = 11  # Python """...""" block
# Strip comments and string-literal contents from Python source code.
def delete_python_note_and_str(str):
    """Return Python source ``str`` with comments and string literals removed.

    Driven by the ``State`` enum above:
      * ``#`` comments are dropped up to (not including) the end of line;
      * single- and double-quoted literals are dropped entirely, quotes
        included, with backslash escapes honoured so an escaped quote
        cannot close the literal;
      * ``'''...'''`` / ``\"\"\"...\"\"\"`` blocks are dropped entirely,
        including any newlines inside them;
      * every newline that survives is emitted as ``'\\n'``.

    NOTE(review): the parameter keeps its original name ``str`` (it shadows
    the builtin) so keyword callers are not broken; internally an alias is
    used instead.
    """
    source = str          # alias: avoid working through the shadowed builtin name
    s = ""
    state = State.CODE
    i = 0
    n = len(source)
    # Index-based loop (instead of enumerate) so triple-quote delimiters can
    # be matched with a three-character lookahead and skipped atomically.
    while i < n:
        c = source[i]
        if state == State.CODE:
            if c == '#':
                state = State.PYTHON_NOTE_SINGLELINE
            elif c == '\'':
                if source.startswith("'''", i):
                    state = State.PYTHON_NOTE_MULTILINE_1
                    i += 2            # consume the remaining two quotes
                else:
                    state = State.CODE_CHAR
            elif c == '\"':
                if source.startswith('"""', i):
                    state = State.PYTHON_NOTE_MULTILINE_2
                    i += 2            # consume the remaining two quotes
                else:
                    state = State.CODE_STRING
            else:
                s += c
        elif state == State.PYTHON_NOTE_SINGLELINE:
            if c == '\n':
                s += '\n'             # comment ends; keep the line structure
                state = State.CODE
        elif state == State.CODE_CHAR:
            if c == '\\':
                state = State.CHAR_ESCAPE_SEQUENCE
            elif c == '\'':
                state = State.CODE    # literal closed; nothing emitted
            elif c == '\n':
                s += '\n'             # unterminated literal ends at the newline
                state = State.CODE
        elif state == State.CHAR_ESCAPE_SEQUENCE:
            state = State.CODE_CHAR   # an escaped character cannot close the literal
        elif state == State.CODE_STRING:
            if c == '\\':
                state = State.STRING_ESCAPE_SEQUENCE
            elif c == '\"':
                state = State.CODE    # literal closed; nothing emitted
            elif c == '\n':
                s += '\n'             # unterminated literal ends at the newline
                state = State.CODE
        elif state == State.STRING_ESCAPE_SEQUENCE:
            state = State.CODE_STRING # an escaped character cannot close the literal
        elif state == State.PYTHON_NOTE_MULTILINE_1:
            if source.startswith("'''", i):
                state = State.CODE
                i += 2                # consume the remaining two quotes
        elif state == State.PYTHON_NOTE_MULTILINE_2:
            if source.startswith('"""', i):
                state = State.CODE
                i += 2                # consume the remaining two quotes
        i += 1
    return s
测试数据：下面这段带有注释和文档字符串的代码将作为删除脚本的输入。
import cStringIO, tokenize
def remove_comments_and_docstrings(source):
    """
    Return `source` minus comments and docstrings.

    Tokenizes the input and re-emits every token except COMMENT tokens and
    STRING tokens in docstring position, padding with spaces so the
    surviving code keeps its original indentation and column layout.
    """
    # cStringIO existed only in Python 2; io.StringIO is its replacement.
    import io
    import tokenize

    io_obj = io.StringIO(source)
    out = ""
    prev_toktype = tokenize.INDENT
    last_lineno = -1
    last_col = 0
    for tok in tokenize.generate_tokens(io_obj.readline):
        token_type = tok[0]
        token_string = tok[1]
        start_line, start_col = tok[2]
        end_line, end_col = tok[3]
        # The following two conditionals preserve indentation.
        # This is necessary because we're not using tokenize.untokenize()
        # (because it spits out code with copious amounts of oddly-placed
        # whitespace).
        if start_line > last_lineno:
            last_col = 0
        if start_col > last_col:
            out += (" " * (start_col - last_col))
        # Remove comments:
        if token_type == tokenize.COMMENT:
            pass
        # This series of conditionals removes docstrings:
        elif token_type == tokenize.STRING:
            if prev_toktype != tokenize.INDENT:
                # This is likely a docstring; double-check we're not inside an operator:
                if prev_toktype != tokenize.NEWLINE:
                    # Note regarding NEWLINE vs NL: the tokenize module
                    # differentiates between newlines that start a new
                    # statement (NEWLINE) and newlines inside of operators
                    # such as parens, brackets, and curly braces (NL).
                    # Catch whole-module docstrings:
                    if start_col > 0:
                        # Unlabelled indentation means we're inside an operator.
                        out += token_string
                    # Note regarding the INDENT token: the tokenize module does
                    # not label indentation inside of an operator (parens,
                    # brackets, and curly braces) as actual indentation.
                    # For example:
                    #     def foo():
                    #         "The spaces before this docstring are tokenize.INDENT"
                    #         test = [
                    #             "The spaces before this string do not get a token"
                    #         ]
        else:
            out += token_string
        prev_toktype = token_type
        last_col = end_col
        last_lineno = end_line
    return out
删除之后的结果：注释、文档字符串以及字符串字面量均已被删除。
import cStringIO, tokenize
def remove_comments_and_docstrings(source):
io_obj = cStringIO.StringIO(source)
out =
prev_toktype = tokenize.INDENT
last_lineno = -1
last_col = 0
for tok in tokenize.generate_tokens(io_obj.readline):
token_type = tok[0]
token_string = tok[1]
start_line, start_col = tok[2]
end_line, end_col = tok[3]
ltext = tok[4]
if start_line > last_lineno:
last_col = 0
if start_col > last_col:
out += ( * (start_col - last_col))
if token_type == tokenize.COMMENT:
pass
elif token_type == tokenize.STRING:
if prev_toktype != tokenize.INDENT:
if prev_toktype != tokenize.NEWLINE:
if start_col > 0:
out += token_string
else:
out += token_string
prev_toktype = token_type
last_col = end_col
last_lineno = end_line
return out