python – 解析pcfg语法树 提取其语法规则 Probabilistic Context-Free Grammar Parser

http://www.voidcn.com/article/p-qnsvatou-byu.html

 

string = '''
    (ROOT
        (S
            (NP (NN Carnac) (DT the) (NN Magnificent))
            (VP (VBD gave) (NP (DT a) (NN talk)))
        )
    )
'''

def is_symbol_char(character):
    '''
    Predicate to test if a character is valid
    for use in a symbol, extend as needed.
    '''

    return character.isalpha() or character in '-=$!?.'

def tokenize(characters):
    '''
    Process characters into a nested structure.  The original string
    '(DT the)' is passed in as ['(', 'D', 'T', ' ', 't', 'h', 'e', ')']
    '''

    tokens = []

    while characters:
        character = characters.pop(0)

        if character.isspace():
            pass  # nothing to do, ignore it

        elif character == '(':  # signals start of recursive analysis (push)
            characters, result = tokenize(characters)
            tokens.append(result)

        elif character == ')':  # signals end of recursive analysis (pop)
            break

        elif is_symbol_char(character):
            # if it looks like a symbol, collect all
            # subsequents symbol characters
            symbol = ''

            while is_symbol_char(character):
                symbol += character
                character = characters.pop(0)

            # push unused non-symbol character back onto characters
            characters.insert(0, character)

            tokens.append(symbol)

    # Return whatever tokens we collected and any characters left over
    return characters, tokens

def extract_rules(tokens):
    ''' Recursively walk tokenized data extracting rules. '''

    head, *tail = tokens

    print(head, '-->', *[x[0] if isinstance(x, list) else x for x in tail])

    for token in tail:  # recurse
        if isinstance(token, list):
            extract_rules(token)

characters, tokens = tokenize(list(string))

# After a successful tokenization, all the characters should be consumed
assert not characters, "Didn't consume all the input!"

print('Tokens:', tokens[0], 'Rules:', sep='\n\n', end='\n\n')

extract_rules(tokens[0])

 

https://github.com/usami/pcfg

posted @ 2020-11-01 00:57  cup_leo  阅读(460)  评论(0编辑  收藏  举报