SQL里提取依赖表-py

一个路径可能会产出多个表，1->n
输入：带sql的文件绝对路径，一行一个路径
输出：文件名\t依赖的表*

# coding=utf-8
import re

def getTable(shName,linesql):
    search_index = 0
    while len(linesql) > 12 and '_' in linesql and ('from ' in linesql or 'join ' in linesql):
        # 从第一个from或join截取
        if linesql.find('from ') >= 0 and linesql.find('join ') >= 0:
            from_index = linesql.index('from ')
            join_index = linesql.index('join ')
            search_index = join_index if join_index < from_index else from_index
        elif linesql.find('from ') >= 0:
            search_index = linesql.index('from ')
        elif linesql.find('join ') >= 0:
            search_index = linesql.index('join ')
        else:
            return
        # print(search_index)
        linesql = linesql[search_index:]
        # 判断是否空格后是表
        pattern = re.compile(r'(?:from|join)\s+(\S+\_*)\s*')
        linesqlli = pattern.findall(linesql)
        for table in linesqlli:
            if table.find('(') == -1 and table.find('_') >=0:
                if table.find('.') == -1:
                    table = 'portal.'+table
                # print(table.lower())
                with open(r'D:\MyDesk\已迁移的文件依赖.txt', encoding='utf-8', mode='a') as rf:
                    rf.write(shName+'\t'+table.lower()+'\n')
            else:continue
        linesql = linesql[linesql.find(table)+len(table)+1:]
        rf.close()

def catFile(filePath):
    linestr = ''
    # 读取每一行 合为一行
    break_pattern = r'(\s|\n|\r|\t)+'
    sh_name = 'portal.'+re.sub(r'D:\\GitProjects\\offlineexperiment\\dev\\hive.*\\sh_','',filePath).replace('.sh','').lower()
    with open(filePath, encoding='utf-8', mode='r') as sf:
        for line in sf:
            line = re.sub(break_pattern,' ',line.replace('\n', ' '))
            linestr += line
    getTable(sh_name,linestr)


if __name__ == '__main__':
    # 准备文件路径列表
    sh_list = []
    with open(r'D:\MyDesk\已迁移的文件.txt', encoding='utf-8', mode='r') as path_file:
        for path in path_file:
            sh_list.append(path.replace('\n', ''))
    path_file.close()
    for sh in sh_list:
        catFile(sh)