提取奖励办数据中人员信息(自用)

2016年

#coding=utf-8
import re
from pymongo import MongoClient

# Patterns for the labelled fields of one person line.  Each capture keeps
# its label (e.g. u'姓名:...').  Every pattern except the award one demands a
# space right after the value, so a value is cut at its first space and a
# field that ends the line without a trailing space does not match at all.
# NOTE(review): values that legitimately contain spaces are therefore
# truncated -- confirm that this matches the stored data.
# Plain u'' literals are used instead of ur'' (the patterns contain no
# backslashes) so the file parses on both Python 2 and Python 3.
_NAME_RE = re.compile(u'(姓名:.*?) .*')
_DUTY_RE = re.compile(u'.* (行政职务:.*?) .*')
_TITLE_RE = re.compile(u'.* (技术职称:.*?) .*')
_UNIT_RE = re.compile(u'.* (工作单位:.*?) .*')
_TECH_CONTRIBUTION_RE = re.compile(u'.* (对本项目技术创造性贡献:.*?) .*')
_ACADEMIC_CONTRIBUTION_RE = re.compile(u'.* (对本项目主要学术贡献:.*?) .*')
_AWARD_RE = re.compile(u'.* (曾获国家科技奖励情况:.*)')


def _first_group(pattern, line):
    """Return group 1 of the first match of *pattern* in *line*, or ''."""
    found = pattern.search(line)
    return found.group(1) if found else ''


def extract_person_fields(line):
    """Parse one person line into a dict of labelled fields.

    Args:
        line: text of the form u'姓名:... 行政职务:... 技术职称:...'.

    Returns:
        A dict with the keys 'name', 'duty', 'title', 'unit',
        'contribution' and 'award'.  A field that is not found is ''.
        'contribution' holds the technical-contribution field and falls
        back to the academic-contribution field when the former is absent.
    """
    contribution = (_first_group(_TECH_CONTRIBUTION_RE, line)
                    or _first_group(_ACADEMIC_CONTRIBUTION_RE, line))
    return {
        'name': _first_group(_NAME_RE, line),
        'duty': _first_group(_DUTY_RE, line),
        'title': _first_group(_TITLE_RE, line),
        'unit': _first_group(_UNIT_RE, line),
        'contribution': contribution,
        'award': _first_group(_AWARD_RE, line),
    }


def main():
    """Copy the people of every 2016 project into the "2016_list" collection.

    Reads each document of "nosta_2016", and for every entry under
    project_content[u'主要完成人'] inserts one document holding the project
    name plus the fields returned by extract_person_fields().
    """
    client = MongoClient("localhost", 27017)
    db = client["nosta"]
    source = db["nosta_2016"]
    target = db["2016_list"]
    # NOTE(review): Database.authenticate and Collection.insert are legacy
    # PyMongo calls (removed in PyMongo 4); kept so the script still runs
    # against the driver it was written for -- confirm before upgrading.
    db.authenticate("zty", "zty")

    projection = {"project_name": 1, "project_content": 1, "_id": 0}
    for n, item in enumerate(source.find({}, projection), 1):
        print(n)  # progress counter; prints the same on Python 2 and 3
        content = item['project_content']
        if u'主要完成人' not in content:
            continue
        # The field may be present but empty; then there is nothing to copy.
        for line in content[u'主要完成人'] or []:
            dc = {'project_name': item['project_name']}
            dc.update(extract_person_fields(line))
            target.insert(dc)


if __name__ == "__main__":
    main()

 

2017、2018年

#coding=utf-8
import re
from pymongo import MongoClient

# Labels that get a space inserted in front of them, in the order they are
# processed.  The 2017/2018 text runs the fields together; the space is what
# the field patterns below use as a delimiter.
_LABELS = (
    u'排名:',
    u'行政职务:',
    u'技术职称:',
    u'工作单位:',
    u'完成项目时所在单位:',
    u'对本项目技术创造性贡献:',
    u'对本项目主要学术贡献:',
    u'曾获国家科技奖励情况:',
)

# Patterns for the labelled fields of one person line.  Each capture keeps
# its label.  Every pattern except the award one demands a space right after
# the value, so a value is cut at its first space.
# NOTE(review): values that legitimately contain spaces are therefore
# truncated -- confirm that this matches the stored data.
# Plain u'' literals are used instead of ur'' (the patterns contain no
# backslashes) so the file parses on both Python 2 and Python 3.
_NAME_RE = re.compile(u'(姓名:.*?) .*')
_DUTY_RE = re.compile(u'.* (行政职务:.*?) .*')
_TITLE_RE = re.compile(u'.* (技术职称:.*?) .*')
_UNIT_RE = re.compile(u'.* (工作单位:.*?) .*')
_TECH_CONTRIBUTION_RE = re.compile(u'.* (对本项目技术创造性贡献:.*?) .*')
_ACADEMIC_CONTRIBUTION_RE = re.compile(u'.* (对本项目主要学术贡献:.*?) .*')
_AWARD_RE = re.compile(u'.* (曾获国家科技奖励情况:.*)')


def split_person_records(raw):
    """Split one concatenated people string into one line per person.

    The text is cut at every u'姓名:' label (anything before the first label
    is dropped), a space is inserted in front of each label in _LABELS, and
    the u'姓名:' label is put back at the start of every record.

    Args:
        raw: the concatenated text of all people of a project.

    Returns:
        A list of person lines, empty when *raw* contains no u'姓名:' label.
    """
    records = []
    for chunk in raw.split(u'姓名:')[1:]:
        for label in _LABELS:
            chunk = chunk.replace(label, u' ' + label)
        records.append(u'姓名:' + chunk)
    return records


def _first_group(pattern, line):
    """Return group 1 of the first match of *pattern* in *line*, or ''."""
    found = pattern.search(line)
    return found.group(1) if found else ''


def extract_person_fields(line):
    """Parse one person line into a dict of labelled fields.

    Args:
        line: a line as produced by split_person_records().

    Returns:
        A dict with the keys 'name', 'duty', 'title', 'unit',
        'contribution' and 'award'.  A field that is not found is ''.
        'contribution' holds the technical-contribution field and falls
        back to the academic-contribution field when the former is absent.
    """
    contribution = (_first_group(_TECH_CONTRIBUTION_RE, line)
                    or _first_group(_ACADEMIC_CONTRIBUTION_RE, line))
    return {
        'name': _first_group(_NAME_RE, line),
        'duty': _first_group(_DUTY_RE, line),
        'title': _first_group(_TITLE_RE, line),
        'unit': _first_group(_UNIT_RE, line),
        'contribution': contribution,
        'award': _first_group(_AWARD_RE, line),
    }


def main():
    """Copy the people of every 2017 project into the "2017_list" collection.

    Reads each document of "nosta_2017", splits the text stored under
    project_content[u'主要完成人:'] into person lines and inserts one
    document per person, holding the project name plus the fields returned
    by extract_person_fields().
    """
    client = MongoClient("localhost", 27017)
    db = client["nosta"]
    source = db["nosta_2017"]
    target = db["2017_list"]
    # NOTE(review): Database.authenticate and Collection.insert are legacy
    # PyMongo calls (removed in PyMongo 4); kept so the script still runs
    # against the driver it was written for -- confirm before upgrading.
    db.authenticate("zty", "zty")

    projection = {"project_name": 1, "project_content": 1, "_id": 0}
    for n, item in enumerate(source.find({}, projection), 1):
        print(n)  # progress counter; prints the same on Python 2 and 3
        content = item['project_content']
        if u'主要完成人:' not in content:
            continue
        raw = content[u'主要完成人:']
        # Skip any empty value ([] as before, and also '' or None) rather
        # than failing on .split() for a value that is not a string.
        if not raw:
            continue
        for line in split_person_records(raw):
            dc = {'project_name': item['project_name']}
            dc.update(extract_person_fields(line))
            target.insert(dc)


if __name__ == "__main__":
    main()

 

posted @ 2018-08-07 17:38  右介  阅读(207)  评论(0)  编辑  收藏  举报