Python-从邮件中提取内容并插入数据库

引子

有100多封eml格式的本地邮件，需要从每封邮件中提取出特定的内容，并插入数据库中。难处在于如何使用正则提取内容。想试验正则表达式的结果，可以下载RegexBuddy，方便调试。

代码

import os
from email import message_from_file
import re
import hashlib
import json
import pymssql

def parseEmail(file):
    with open(file) as f:
        name = f.name
        msg = message_from_file(f)
        target_text = msg.get_payload()[0].get_payload()
    # <span>Timestamp: 2020-07-30 00:00:00 (UTC) </span>
    # <strong>Koality_AutoDQ4_OnePipeline_OfficeForms</strong>
    timestamp = re.findall("Timestamp: (\d{4}-\d{1,2}-\d{1,2})",target_text)[0]
    dataset = re.findall("Dataset: .+_.+_.+_(.+_*\w+) <http",target_text)[0]
    unique_dataset_name = re.findall("Dataset: (.+_.+_.+_.+) <http",target_text)[0]
    metrics = re.findall(" Metrics[:=]\n*(.+\n*.+)\n*<=*\n*h", target_text)[0]
    metrics = metrics.replace('\n', '').replace('=','').replace(': ', '')

    # Koality_AutoDQ4_OA-OXO_Teams_DoD, whose dataset is  Teams_DoD instaed of DoD
    if 'Teams_' in unique_dataset_name:
        dataset = 'Teams_' + dataset


    incident_source_text = target_text[target_text.find('Incident List'): target_text.find('RootCause')].replace('=', '').replace('\n', '')
    incidents_str = incident_source_text[:incident_source_text.find('<https')]
    incdient_groups = re.findall("(\w+:\w+),", incidents_str)
    incident_dict = {}
    for incident in incdient_groups:
        k, v = incident.split(':')
        incident_dict[k] = v
    incident_json = json.dumps(incident_dict)
    hashkey = toHash(incident_json)
    return timestamp, dataset, metrics, incident_json, hashkey, unique_dataset_name

def toHash(text):
    hs = hashlib.md5()
    hs.update(text.encode())
    return hs.hexdigest().upper()


def insertIntoDB(conn,table, results):
    cur = conn.cursor()
    count = 0
    for row in results:
        rowDict = {'table': table,
                   'timestamp':row[0], 
                   'dataset':row[1],
                   'metric':row[2], 
                   'json':row[3],
                   'hashkey':row[4],
                   'unique_dataset_name': row[5]}
        sql = """
                insert into [{table}] 
                ([WorkloadName], [MetricName], [DimensionCombination_JsonForDb_Hash], [Timestamp], [DimensionCombination_JsonForDb], [UniqueDatasetName], [MetricValue], [ExpectedMetricValue])
                VALUES ('{dataset}', '{metric}','{hashkey}','{timestamp}', '{json}', '{unique_dataset_name}', 0, 0); 
        """.format(**rowDict)
        try:
            cur.execute(sql)
            print('Inserted a row......')
        except:
            print(row[1], row[5])
        count += 1


    print("Successfully inserted {:d} rows....".format(count))
    cur.close()
    conn.close()
    return True


def main():
    print("Starting script......")

    emailFolder = r"C:\Work\IncidentEmails"

    results = []
    for filePath in os.listdir(emailFolder):
        fullPath = emailFolder + "\\" + filePath
        timestamp, dataset, metrics, incident_json, hashkey, unique_dataset_name = parseEmail(fullPath)
        results.append([timestamp, dataset, metrics, incident_json, hashkey, unique_dataset_name])

    conn = pymssql.connect('server', 'usr@server', 'pwd', 'db', autocommit=True)
    insertIntoDB(conn, 'KenshoAutoDQAlert', results)

if __name__ == "__main__":
    main()

View Code

posted @ 2020-09-17 18:14 F君君阅读(476) 评论(0) 编辑收藏举报

刷新页面返回顶部

F君君

Python-从邮件中提取内容并插入数据库

引子

代码

公告