提取mongodb中论文的信息,填入mysql,加快统计速度

1.创建mysql的alzheimer表,包括pmc_id,journal,title,abstract,name,authorinfor,pun_year,keyword,reference信息。

#encoding = utf-8
import pymysql
import json

def input_from_json(filename):
    with open(filename,'r') as file:
        data = json.loads(file.read())
        return data
conn= pymysql.connect(
        host='localhost',
        port = 3306,
        user='root',
        passwd='',
        db='python'
        )
cur = conn.cursor()

def createdb():
    cur.execute('create table alzheimer(id int AUTO_INCREMENT,PRIMARY KEY (id))')
    cur.execute('alter table alzheimer add pmc_id int')
    cur.execute('alter table alzheimer add journal text')
    cur.execute('alter table alzheimer add title text')
    cur.execute('alter table alzheimer add abstract text')
    cur.execute('alter table alzheimer add namestr text')
    cur.execute('alter table alzheimer add authorinfor text')
    cur.execute('alter table alzheimer add pub_year varchar(5)')
    cur.execute('alter table alzheimer add union_kwd_str text')
    cur.execute('alter table alzheimer add reference_str text')

createdb()

#up is all the test
def addnewcloumn():
    cur.execute('alter table test add transactor varchar(10)')
    cur.execute('alter table ad add hasid varchar(10)')

def addtomysql():
    idlist = input_from_json('id_list.json')['idList']
    values = []
    for i in range(len(idlist)):
        values.append((i,idlist[i]))

    cur.executemany('insert into ad  values(%s,%s)',values)


def updatenewcol():
    idlist = input_from_json('id_list.json')['idList']
    values = []
    for i in range(len(idlist)):
        values.append(('yes',i))
    cur.executemany('update ad set hasid = (%s) where id = (%s)',values)

def selectpmcid():
    sql = "SELECT pmc_id FROM ad "
    a = cur.execute(sql)
    print a
    b = cur.fetchmany(a)  #b has 7887 abstract list

    print b[0][0],b[2][0]
    print type(b[0][0])   #int

def addnewcolunm():
    cur.execute('alter table ad add journal text')
    cur.execute('alter table ad add title text')
    cur.execute('alter table ad add abstract text')
    cur.execute('alter table ad add namestr text')
    cur.execute('alter table ad add authorinfor text')
    cur.execute('alter table ad add pub_year varchar(5)')
    cur.execute('alter table ad add union_kwd_str text')
    cur.execute('alter table ad add reference_str text')

def inserttest():
    cur.execute('create table test2(id int AUTO_INCREMENT,PRIMARY KEY (id))')
    cur.execute('alter table test2 add pmc_id int')
    cur.execute('alter table test2 add title text')
def inserttest2():
    values = []
    for i in range(10):
        values.append((i,'hello'+str(i)))
    cur.executemany('insert into test2(pmc_id,title) values(%s,%s)',values)

   
conn.commit()   
cur.close()
conn.close()

2.从mongodb中获取信息并且填入mysql,因为有12万条,中间可能有不正确的数据无法填入,分段插入。实际结果表明,insert语句比update语句快了将近十倍不止。最终选择新建一个完全为空的表,采用insert进行插入。

问题:引用的信息提取的有问题。接下来要进行的工作。

from pymongo import MongoClient
from bs4 import BeautifulSoup
from nltk.tokenize import MWETokenizer
import re
import pymysql
import time

# create link
client = MongoClient('localhost', 27017)
db = client['ad']   #or db = client.ad
collection = db['xml']   #create cousin

mongodb_to_mysql_list = []
num = 0
#first [:27810]
#second [27811:60000]
#third [60000:100000]   100000 cant 
#[100001:] 9859 cant  109861cant
#[100001:109860]
begin = time.time()
for item in collection.find()[109861:]:
    pmc_xml = item['xml']
    pmc_id = item['pmc_id'].encode('utf-8')
    pmc_id = int(pmc_id)
    soup = BeautifulSoup(pmc_xml,"lxml")  #if not add "lxml" will warning but not error

    print num
    #find the journal
    journal = soup.find('journal-title')
    if journal != None:
        journal=journal.text.encode('utf-8')
    else:
        journal = ''
    
    #find the article
    title = soup.find('article-title')
    if title != None:
        title = title.text.encode('utf-8')
    else:
        title = ''

    #show the author
    authornamelist = soup.find_all('contrib')
    namestr = ''
    for x in authornamelist:
        if x.surname != None:
            name = x.surname.text
            if x.surname.next_sibling != None:
                if x.surname.next_sibling.next_sibling != None:
                    name = name +' '+ x.surname.next_sibling.next_sibling.text
        namestr = namestr+name.encode('utf-8')+','

    #show the firt author information
    authorinfor = ''
    authorinfor = soup.find('aff')
    if authorinfor != None:
        authorinfor =authorinfor.text.encode('utf-8')
    else:
        authorinfor = ''

    #show the receive time year
    pub_year = soup.find('year')
    if pub_year != None:
        pub_year=pub_year.text.encode('utf-8')
    else:
        pub_year=''
    
    #show the abstract
    a = soup.find('abstract')
    if a != None:
        if a.p != None:
            abstract = a.p.text.encode('utf-8')
        else:
            abstract = ''

    #show the key-words
    kwdlist = soup.find_all('kwd')
    union_kwd_str = ''
    for x in kwdlist:
        kwd = x.text.lower().encode('utf-8')
        kwdstr = re.sub("\"|,|\.", "", kwd)
        kwd = tuple(kwdstr.split())
        tokenizer = MWETokenizer([kwd])
        union_kwd = tokenizer.tokenize(kwdstr.split())
        if union_kwd != []:
            union_kwd_str = union_kwd_str+union_kwd[0]+','

    #show the reference id
    pub_id_list = soup.find_all('pub-id')
    reference_idlist = []
    reference_str = ''
    for x in pub_id_list:
        if x != None:
            reference = x.text.encode('utf-8')
            reference_idlist.append(reference)
        reference_str = reference_str+reference+','

    mongodb_to_mysql_list.append((pmc_id,journal,title,abstract,namestr,authorinfor,pub_year,union_kwd_str,reference_str))

    num += 1

print num
end1 = time.time() -begin
print end1


conn= pymysql.connect(
        host='localhost',
        port = 3306,
        user='root',
        passwd='',
        db='python'
        )
cur = conn.cursor()
#cur.executemany('update ad set journal=(%s),title=(%s),abstract=(%s),namestr=(%s),authorinfor=(%s),pub_year=(%s),union_kwd_str=(%s),reference_str=(%s) where pmc_id = (%s)',mongodb_to_mysql_list)
#99s 100
cur.executemany('insert into alzheimer(pmc_id,journal,title,abstract,namestr,authorinfor,pub_year,union_kwd_str,reference_str) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)',mongodb_to_mysql_list)
#8.5s 100

#cur.executemany('insert into test2(pmc_id,title) values(%s,%s)',values)
conn.commit()   
cur.close()
conn.close()

end2 = time.time() -begin
print end2

 

posted @ 2016-12-14 12:55  zdmlcmepl  阅读(382)  评论(0编辑  收藏  举报