使用python脚本自动删除python代码中的注释和字符串中的内容

删除脚本

from enum import Enum


# Scanner states for the comment/string stripper.
class State(Enum):

    CODE = 0                        # plain code
    SLASH = 1                       # slash seen (C-style scanning; appears unused in this file)
    NOTE_MULTILINE = 2              # inside a multi-line comment (C-style; appears unused here)
    NOTE_SINGLELINE = 3             # inside a single-line comment (C-style; appears unused here)
    BACKSLASH = 4                   # backslash / line continuation (appears unused here)
    CODE_CHAR = 5                   # inside a single-quoted ('...') literal
    CHAR_ESCAPE_SEQUENCE = 6        # escape sequence inside a single-quoted literal (unused here)
    CODE_STRING = 7                 # inside a double-quoted ("...") literal
    STRING_ESCAPE_SEQUENCE = 8      # escape sequence inside a double-quoted literal (unused here)
    PYTHON_NOTE_SINGLELINE = 9      # Python '#' single-line comment
    PYTHON_NOTE_MULTILINE_1 = 10    # Python triple-quoted block using '''...'''
    PYTHON_NOTE_MULTILINE_2 = 11    # Python triple-quoted block using """..."""

# Remove comments and string-literal contents from Python source code.
def delete_python_note_and_str(str):
    """Strip comments and string literals from Python source text.

    Removes:
      * ``#`` comments — the content is dropped, the line break is kept
        (emitted as ``\\r\\n``, matching the original convention);
      * one-line ``'...'`` / ``"..."`` literals, quotes included;
      * triple-quoted blocks, quotes included — internal line breaks are
        dropped, only the line break after the closing quotes survives.

    Fixes over the original state machine: the per-literal quote counters
    are no longer shared across literals (previously three separate
    one-line strings could flip the scanner into triple-quote mode), an
    empty literal (``''`` / ``""``) no longer swallows the code after it,
    and escape sequences such as ``\\'`` no longer end a literal early.

    Args:
        str: Python source code as one string.  (The parameter name
            shadows the builtin ``str``; kept for backward compatibility.)

    Returns:
        The source with comments and string literals removed.

    Note:
        An escaped quote immediately before a triple-quote terminator is
        not recognised (rare in practice).
    """
    source = str  # local alias so the shadowed builtin name is not used below
    out = []
    i = 0
    n = len(source)
    while i < n:
        c = source[i]
        if c == '#':
            # Comment: drop everything up to the end of the line, but keep
            # the line break itself so the line structure survives.
            while i < n and source[i] != '\n':
                i += 1
            if i < n:
                out.append('\r\n')
                i += 1
        elif c == '\'' or c == '\"':
            quote = c
            triple = quote * 3
            if source.startswith(triple, i):
                # Triple-quoted block: drop everything through the matching
                # closer (or to end-of-input if unterminated).
                end = source.find(triple, i + 3)
                i = n if end == -1 else end + 3
            else:
                # One-line literal: drop through the closing quote.  An
                # unterminated literal ends at the line break, which is
                # kept (as \r\n) like the original did.
                i += 1
                while i < n and source[i] != quote and source[i] != '\n':
                    if source[i] == '\\' and i + 1 < n:
                        i += 2  # skip an escape sequence, e.g. \' or \\
                    else:
                        i += 1
                if i < n:
                    if source[i] == '\n':
                        out.append('\r\n')
                    i += 1
        else:
            out.append(c)
            i += 1
    return ''.join(out)

测试数据

import cStringIO, tokenize
def remove_comments_and_docstrings(source):
    """
    Returns 'source' minus comments and docstrings.
    """
    # NOTE(review): this is the Python 2 demo input for the stripper above
    # (cStringIO does not exist on Python 3; io.StringIO would be needed).
    # Kept verbatim so it still matches the sample output shown below.
    io_obj = cStringIO.StringIO(source)
    out = ""
    prev_toktype = tokenize.INDENT
    last_lineno = -1
    last_col = 0
    for tok in tokenize.generate_tokens(io_obj.readline):
        token_type = tok[0]
        token_string = tok[1]
        start_line, start_col = tok[2]
        end_line, end_col = tok[3]
        ltext = tok[4]  # full physical line of the token (unused here)
        # The following two conditionals preserve indentation.
        # This is necessary because we're not using tokenize.untokenize()
        # (because it spits out code with copious amounts of oddly-placed
        # whitespace).
        if start_line > last_lineno:
            last_col = 0
        if start_col > last_col:
            out += (" " * (start_col - last_col))
        # Remove comments:
        if token_type == tokenize.COMMENT:
            pass
        # This series of conditionals removes docstrings:
        elif token_type == tokenize.STRING:
            if prev_toktype != tokenize.INDENT:
        # This is likely a docstring; double-check we're not inside an operator:
                if prev_toktype != tokenize.NEWLINE:
                    # Note regarding NEWLINE vs NL: The tokenize module
                    # differentiates between newlines that start a new statement
                    # and newlines inside of operators such as parens, brackes,
                    # and curly braces.  Newlines inside of operators are
                    # NEWLINE and newlines that start new code are NL.
                    # Catch whole-module docstrings:
                    if start_col > 0:
                        # Unlabelled indentation means we're inside an operator
                        out += token_string
                    # Note regarding the INDENT token: The tokenize module does
                    # not label indentation inside of an operator (parens,
                    # brackets, and curly braces) as actual indentation.
                    # For example:
                    # def foo():
                    #     "The spaces before this docstring are tokenize.INDENT"
                    #     test = [
                    #         "The spaces before this string do not get a token"
                    #     ]
        else:
            out += token_string
        prev_toktype = token_type
        last_col = end_col
        last_lineno = end_line
    return out

删除之后的结果

import cStringIO, tokenize
def remove_comments_and_docstrings(source):
    
    io_obj = cStringIO.StringIO(source)
    out = 
    prev_toktype = tokenize.INDENT
    last_lineno = -1
    last_col = 0
    for tok in tokenize.generate_tokens(io_obj.readline):
        token_type = tok[0]
        token_string = tok[1]
        start_line, start_col = tok[2]
        end_line, end_col = tok[3]
        ltext = tok[4]
        
        
        
        
        if start_line > last_lineno:
            last_col = 0
        if start_col > last_col:
            out += ( * (start_col - last_col))
        
        if token_type == tokenize.COMMENT:
            pass
        
        elif token_type == tokenize.STRING:
            if prev_toktype != tokenize.INDENT:
        
                if prev_toktype != tokenize.NEWLINE:
                    
                    
                    
                    
                    
                    
                    if start_col > 0:
                        
                        out += token_string
                    
                    
                    
                    
                    
                    
                    
                    
                    
        else:
            out += token_string
        prev_toktype = token_type
        last_col = end_col
        last_lineno = end_line
    return out
posted @ 2021-09-14 14:46  凯尔哥  阅读(1006)  评论(0编辑  收藏  举报