爬虫案例之PubMed数据库下载
代码
# encoding=utf-8
"""Bulk-download PubMed records for a list of drugs through NCBI E-utilities.

For every drug in ``FILES`` the script runs an ESearch query with
``usehistory=y`` and then pages through the stored result set with EFetch,
``retmax`` records at a time.  Each page is saved as
``<BASE>/<drug>/<first>-<last>.xml``.  The offset of every completed page is
appended to ``<BASE>/<drug>/checkpoint.txt`` so an interrupted run can resume,
and failed pages are logged to ``<BASE>/<drug>/error.txt``.
"""
import os
import re
import ssl
import time
import urllib.parse
import urllib.request

# NOTE(security): this disables TLS certificate verification for every HTTPS
# request made by the process.  It is kept because the original script relies
# on it, but it should be removed once the local CA store is set up correctly.
ssl._create_default_https_context = ssl._create_unverified_context

retmax = 500      # records requested per EFetch call
FAILURE = 0
SUCCESS = 1
startNum = 1      # 1-based index of the first record to download

BASE = 'NARA'
FILES = ['Losartan', 'Valsartan', 'Irbesartan', 'Eprosartan', 'Candesartan',
         'Telmisartan', 'Olmesartan']
# Alternative drug set:
# BASE = 'Triptans'
# FILES = ['Sumatriptan','Zolmitriptan',
#          'Naratriptan','Rizatriptan','Almotriptan',
#          'Frovatriptan','Eletriptan']

ESEARCH_URL = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?'
               'db=pubmed&term={}&usehistory=y')
EFETCH_URL = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
              'db=pubmed&query_key=%s&WebEnv=%s&retstart=%s&retmax=%s&retmode=xml')

# Raw string so the \d / \S escapes are regex escapes, DOTALL and lazy
# quantifiers so the match still works if the response contains newlines.
_ESEARCH_PATTERN = re.compile(
    r'<Count>(\d+)</Count>.*?<QueryKey>(\d+)</QueryKey>.*?<WebEnv>(\S+?)</WebEnv>',
    re.DOTALL)

os.makedirs(BASE, exist_ok=True)


def _read_checkpoint(fd):
    """Return the last offset stored in checkpoint file *fd*, or ``None``.

    The file is created empty when it does not exist.  Blank lines are
    ignored.  ``None`` means no page has been recorded yet, which is
    different from a recorded offset of ``0``.
    """
    print(fd)
    if not os.path.isfile(fd):
        with open(fd, 'w'):
            pass
        return None
    with open(fd, 'r') as handle:
        entries = [line.strip() for line in handle if line.strip()]
    return int(entries[-1]) if entries else None


def lastline(fd):
    """Return the last offset recorded in *fd*, or ``0`` if nothing is recorded.

    Creates *fd* as an empty file when it does not exist.
    """
    last = _read_checkpoint(fd)
    return 0 if last is None else last


def Download(drug, sleep_time, query_key, webenv, endNum):
    """Fetch every remaining result page for *drug* and save it as XML.

    Args:
        drug: Drug name; also the sub-directory of ``BASE`` that is written to.
        sleep_time: Seconds to wait before each EFetch request.
        query_key: ``QueryKey`` returned by ESearch.
        webenv: ``WebEnv`` returned by ESearch.
        endNum: Total number of records in the result set.

    Returns:
        ``SUCCESS`` once all pages up to *endNum* have been saved.

    Raises:
        Exception: The original download error is re-raised after it has been
            printed and appended to ``error.txt``.
    """
    checkpoint_path = '{}/{}/checkpoint.txt'.format(BASE, drug)
    error_path = '{}/{}/error.txt'.format(BASE, drug)

    last_done = _read_checkpoint(checkpoint_path)
    if last_done is None:
        start = startNum - 1
    else:
        # The checkpoint stores the offset of the last finished page, so
        # resume with the page after it (this also holds for offset 0).
        start = last_done + retmax

    with open(checkpoint_path, 'a') as checkpoint:
        for retstart in range(start, endNum, retmax):
            time.sleep(sleep_time)
            first, last = retstart + 1, retstart + retmax
            print('\tdownloading: %d - %d' % (first, last))
            url = EFETCH_URL % (query_key, webenv, retstart, retmax)
            target = '%s/%s/%d-%d.xml' % (BASE, drug, first, last)
            try:
                urllib.request.urlretrieve(url, target)
            except Exception as ex:
                print(ex)
                with open(error_path, 'a') as error_log:
                    error_log.write('%d-%d\t%r\n' % (first, last, ex))
                raise
            # Record the page only after it was saved, and flush so the
            # checkpoint survives a crash.
            checkpoint.write('%d\n' % retstart)
            checkpoint.flush()

    print('Downloading is done........................')
    return SUCCESS


def Download_auto(fun, drug, query_key, webenv, endNum, sleep_time=5,
                  max_retries=None):
    """Call *fun* until it succeeds, waiting longer after every failure.

    Args:
        fun: Callable with the signature of ``Download``.
        drug, query_key, webenv, endNum: Passed through to *fun*.
        sleep_time: Initial delay between requests; raised by 5 seconds after
            each failed attempt.
        max_retries: Maximum number of failed attempts to tolerate.  ``None``
            (the default) retries indefinitely.

    Raises:
        Exception: The last error from *fun* once *max_retries* is exceeded.
    """
    failures = 0
    while True:
        try:
            if fun(drug, sleep_time, query_key, webenv, endNum) == SUCCESS:
                break
        except Exception:
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            sleep_time += 5
            print('prolong sleep time:', sleep_time)


def main(drug):
    """Main routine: search PubMed for *drug* and download all matching records.

    Raises:
        ValueError: If the ESearch response lacks Count, QueryKey or WebEnv.
    """
    os.makedirs('{}/{}'.format(BASE, drug), exist_ok=True)

    term = urllib.parse.quote(drug)
    query = '%s[TIAB]+OR+%s[MH]' % (term, term)
    url = ESEARCH_URL.format(query)
    with urllib.request.urlopen(url) as history:
        content = history.read().decode()

    found = _ESEARCH_PATTERN.search(content)
    if found is None:
        raise ValueError(
            'unexpected ESearch response for %r: Count/QueryKey/WebEnv not found'
            % drug)
    count, query_key, webenv = found.groups()

    print('total counts: %s' % count)
    endNum = int(count)
    print(endNum)
    Download_auto(Download, drug, query_key, webenv, endNum)


if __name__ == '__main__':
    start = time.time()
    for drug_name in FILES:
        main(drug_name)
    print(time.time() - start)