Python解析xml文档实战案例
xml文档
<?xml version="1.0" ?> <!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2019//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_190101.dtd"> <PubmedArticleSet> <PubmedArticle> <MedlineCitation Status="MEDLINE" Owner="NLM"> <PMID Version="1">28901317</PMID> <DateCompleted> <Year>2018</Year> <Month>05</Month> <Day>10</Day> </DateCompleted> <DateRevised> <Year>2018</Year> <Month>12</Month> <Day>02</Day> </DateRevised> <Article PubModel="Print"> <Journal> <ISSN IssnType="Electronic">1998-4138</ISSN> <JournalIssue CitedMedium="Internet"> <Volume>13</Volume> <Issue>4</Issue> <PubDate> <Year>2017</Year> </PubDate> </JournalIssue> <Title>Journal of cancer research and therapeutics</Title> <ISOAbbreviation>J Cancer Res Ther</ISOAbbreviation> </Journal> <ArticleTitle><i>k-RAS</i> mutation and resistance to epidermal growth factor receptor-tyrosine kinase inhibitor treatment in patients with nonsmall cell lung cancer.</ArticleTitle> <Pagination> <MedlinePgn>699-701</MedlinePgn> </Pagination> <ELocationID EIdType="doi" ValidYN="Y">10.4103/jcrt.JCRT_468_17</ELocationID> <Abstract> <AbstractText Label="OBJECTIVE" NlmCategory="OBJECTIVE">The aim of this study was to evaluate the relationship between k-RAS gene mutation and the resistance to epidermal growth factor receptor-tyrosine kinase inhibitor (EGFR-TKI) treatment in patients with nonsmall-cell lung cancer (NSCLC).</AbstractText> <AbstractText Label="METHODS" NlmCategory="METHODS">Forty-five pathologies confirmed NSCLC patients who received EGFR-TKI (Gefitinib) treatment were retrospectively included in this study. The mutation of codon 12 and 13, located in exon1 and exon 2 of k-RAS gene were examined by polymerase chain reaction (PCR) and DAN sequencing in tumor samples of the included 45 NSCLC patients. 
The correlation between Gefitinib treatment response and k-RAS mutation status was analyzed in tumor samples of the 45 NSCLC patients.</AbstractText> <AbstractText Label="RESULTS" NlmCategory="RESULTS">Eight tumor samples of the 45 NSCLC patients were found to be mutated in coden 12 or 13, with an mutation rate of 17.8% (8/45); the objective response rate (ORR) was 29.7%(11/37) with 1 cases of complete response (CR) and 10 cases of partial response in k-RAS mutation negative patients. Furthermore, the ORR was 0.0% in k-RAS mutation positive patients with none CR. The ORR between k-RAS mutation and nonmutation patients were significant different (P &lt; 0.05).</AbstractText> <AbstractText Label="CONCLUSION" NlmCategory="CONCLUSIONS">k-RAS gene mutation status was associated with the response of Gefitinib treatment in patients with NSCLC.</AbstractText> </Abstract> <AuthorList CompleteYN="Y"> <Author ValidYN="Y"> <LastName>Zhou</LastName> <ForeName>Bin</ForeName> <Initials>B</Initials> <AffiliationInfo> <Affiliation>Department of Pharmacy, The First Affiliated Hospital of Wenzhou Medical University, Wenzhou, Zhejiang, Province 325200, PR China.</Affiliation> </AffiliationInfo> </Author> <Author ValidYN="Y"> <LastName>Tang</LastName> <ForeName>Congrong</ForeName> <Initials>C</Initials> <AffiliationInfo> <Affiliation>Department of Pharmacy, The First Affiliated Hospital of Wenzhou Medical University, Wenzhou, Zhejiang, Province 325200, PR China.</Affiliation> </AffiliationInfo> </Author> <Author ValidYN="Y"> <LastName>Li</LastName> <ForeName>Jie</ForeName> <Initials>J</Initials> <AffiliationInfo> <Affiliation>Department of Pharmacy, Ruian People's Hospital, Ruian, Zhejiang, Province 325200, PR China.</Affiliation> </AffiliationInfo> </Author> </AuthorList> <Language>eng</Language> <PublicationTypeList> <PublicationType UI="D016428">Journal Article</PublicationType> </PublicationTypeList> </Article> <MedlineJournalInfo> <Country>India</Country> <MedlineTA>J Cancer Res 
Ther</MedlineTA> <NlmUniqueID>101249598</NlmUniqueID> <ISSNLinking>1998-4138</ISSNLinking> </MedlineJournalInfo> <ChemicalList> <Chemical> <RegistryNumber>0</RegistryNumber> <NameOfSubstance UI="C117307">KRAS protein, human</NameOfSubstance> </Chemical> <Chemical> <RegistryNumber>0</RegistryNumber> <NameOfSubstance UI="D047428">Protein Kinase Inhibitors</NameOfSubstance> </Chemical> <Chemical> <RegistryNumber>0</RegistryNumber> <NameOfSubstance UI="D011799">Quinazolines</NameOfSubstance> </Chemical> <Chemical> <RegistryNumber>EC 2.7.10.1</RegistryNumber> <NameOfSubstance UI="C512478">EGFR protein, human</NameOfSubstance> </Chemical> <Chemical> <RegistryNumber>EC 2.7.10.1</RegistryNumber> <NameOfSubstance UI="D066246">ErbB Receptors</NameOfSubstance> </Chemical> <Chemical> <RegistryNumber>EC 3.6.5.2</RegistryNumber> <NameOfSubstance UI="D016283">Proto-Oncogene Proteins p21(ras)</NameOfSubstance> </Chemical> <Chemical> <RegistryNumber>S65743JHBS</RegistryNumber> <NameOfSubstance UI="D000077156">Gefitinib</NameOfSubstance> </Chemical> </ChemicalList> <CitationSubset>IM</CitationSubset> <MeshHeadingList> <MeshHeading> <DescriptorName UI="D000328" MajorTopicYN="N">Adult</DescriptorName> </MeshHeading> <MeshHeading> <DescriptorName UI="D000368" MajorTopicYN="N">Aged</DescriptorName> </MeshHeading> <MeshHeading> <DescriptorName UI="D002289" MajorTopicYN="N">Carcinoma, Non-Small-Cell Lung</DescriptorName> <QualifierName UI="Q000188" MajorTopicYN="Y">drug therapy</QualifierName> <QualifierName UI="Q000235" MajorTopicYN="N">genetics</QualifierName> <QualifierName UI="Q000473" MajorTopicYN="N">pathology</QualifierName> </MeshHeading> <MeshHeading> <DescriptorName UI="D019008" MajorTopicYN="N">Drug Resistance, Neoplasm</DescriptorName> </MeshHeading> <MeshHeading> <DescriptorName UI="D066246" MajorTopicYN="N">ErbB Receptors</DescriptorName> <QualifierName UI="Q000037" MajorTopicYN="N">antagonists &amp; inhibitors</QualifierName> </MeshHeading> <MeshHeading> <DescriptorName 
UI="D005260" MajorTopicYN="N">Female</DescriptorName> </MeshHeading> <MeshHeading> <DescriptorName UI="D000077156" MajorTopicYN="N">Gefitinib</DescriptorName> </MeshHeading> <MeshHeading> <DescriptorName UI="D006801" MajorTopicYN="N">Humans</DescriptorName> </MeshHeading> <MeshHeading> <DescriptorName UI="D008297" MajorTopicYN="N">Male</DescriptorName> </MeshHeading> <MeshHeading> <DescriptorName UI="D008875" MajorTopicYN="N">Middle Aged</DescriptorName> </MeshHeading> <MeshHeading> <DescriptorName UI="D009154" MajorTopicYN="N">Mutation</DescriptorName> </MeshHeading> <MeshHeading> <DescriptorName UI="D047428" MajorTopicYN="N">Protein Kinase Inhibitors</DescriptorName> <QualifierName UI="Q000008" MajorTopicYN="Y">administration &amp; dosage</QualifierName> </MeshHeading> <MeshHeading> <DescriptorName UI="D016283" MajorTopicYN="N">Proto-Oncogene Proteins p21(ras)</DescriptorName> <QualifierName UI="Q000235" MajorTopicYN="Y">genetics</QualifierName> </MeshHeading> <MeshHeading> <DescriptorName UI="D011799" MajorTopicYN="N">Quinazolines</DescriptorName> <QualifierName UI="Q000008" MajorTopicYN="Y">administration &amp; dosage</QualifierName> </MeshHeading> </MeshHeadingList> </MedlineCitation> <PubmedData> <History> <PubMedPubDate PubStatus="entrez"> <Year>2017</Year> <Month>9</Month> <Day>14</Day> <Hour>6</Hour> <Minute>0</Minute> </PubMedPubDate> <PubMedPubDate PubStatus="pubmed"> <Year>2017</Year> <Month>9</Month> <Day>14</Day> <Hour>6</Hour> <Minute>0</Minute> </PubMedPubDate> <PubMedPubDate PubStatus="medline"> <Year>2018</Year> <Month>5</Month> <Day>11</Day> <Hour>6</Hour> <Minute>0</Minute> </PubMedPubDate> </History> <PublicationStatus>ppublish</PublicationStatus> <ArticleIdList> <ArticleId IdType="pubmed">28901317</ArticleId> <ArticleId IdType="pii">JCanResTher_2017_13_4_699_214476</ArticleId> <ArticleId IdType="doi">10.4103/jcrt.JCRT_468_17</ArticleId> </ArticleIdList> </PubmedData> </PubmedArticle> </PubmedArticleSet>
方法一:xml.etree.ElementTree(旧版本 Python 中的 cElementTree,已在 Python 3.9 中移除)
# -*- coding: utf-8 -*-
"""
@Datetime: 2019/4/25
@Author: Zhang Yafei

Extract per-article metadata from PubMed XML exports with ElementTree and
append it to one CSV file per top-level source directory.
"""
import os
import re
import threading
# xml.etree.cElementTree was removed in Python 3.9; ElementTree picks up the
# C accelerator on its own, so the plain module is a drop-in replacement.
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import chain

import pandas as pd

# Top-level folders scanned for ``*.xml`` files; each one gets ``<folder>.csv``.
SOURCE_DIRS = ('first in class drug', 'follow on drug', 'me too drug')

# Single source of truth for the CSV layout.  Rows are appended without a
# header, so the header written by get_files_path() and every appended row
# must use exactly this column order.
CSV_COLUMNS = ['PMID', 'affiliations_info', 'article_type', 'country',
               'file_path', 'journal', 'mesh_words', 'pub_year']

# One lock shared by every worker thread.  (A lock created inside the worker,
# e.g. ``with threading.Lock():``, is private to that call and protects nothing.)
_CSV_LOCK = threading.Lock()


def _affiliations(citation):
    """Return the distinct affiliation strings of all authors, in first-seen order."""
    nodes = citation.findall('Article/AuthorList/Author/AffiliationInfo/Affiliation')
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return list(dict.fromkeys(node.text for node in nodes if node.text))


def _pub_year(citation):
    """Return the publication year as a string, or '' when none can be found."""
    pub_date = 'Article/Journal/JournalIssue/PubDate/'
    year = citation.findtext(pub_date + 'Year')
    if year:
        return year
    # Records without <Year> carry a free-text <MedlineDate> such as
    # "2017 Sep-Oct"; take its first run of digits.
    match = re.search(r'\d+', citation.findtext(pub_date + 'MedlineDate') or '')
    return match.group(0) if match else ''


def _mesh_terms(citation):
    """Return MeSH terms as 'Descriptor' or 'Descriptor/Qualifier' strings."""
    terms = []
    for heading in citation.findall('MeshHeadingList/MeshHeading'):
        children = list(heading)
        if len(children) == 1:
            # Descriptor without qualifiers: emit the descriptor itself.
            if children[0].text:
                terms.append(children[0].text)
            continue
        descriptor = ''
        for child in children:
            if child.tag == 'DescriptorName':
                descriptor = child.text or ''
            elif descriptor and child.tag == 'QualifierName':
                terms.append(descriptor + '/' + (child.text or ''))
    return terms


def pubmed_xml_parser(path):
    """Parse one PubMed XML file and append its articles to ``<top dir>.csv``.

    :param path: path of the XML file, relative to the working directory; its
        first path component names the CSV file the rows are appended to.
    :return: None.  One row per distinct PMID is appended (no header) in
        ``CSV_COLUMNS`` order.
    :raises xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
    """
    # Accept both separators so the script behaves the same on Windows and POSIX.
    dir_name = re.split(r'[\\/]', path, maxsplit=1)[0]
    print(dir_name)
    root = ET.parse(path).getroot()
    data_list = []
    seen_pmids = set()
    for article_node in root.iter('PubmedArticle'):
        citation = article_node.find('MedlineCitation')
        if citation is None:
            continue
        pmid = citation.findtext('PMID')
        if not pmid or pmid in seen_pmids:
            continue
        seen_pmids.add(pmid)

        if citation.find('MeshHeadingList') is None:
            # Same diagnostic the script has always printed for records without MeSH terms.
            print(pmid)

        # Element.getchildren() was removed in Python 3.9; findall() is the replacement.
        publication_types = citation.findall('Article/PublicationTypeList/PublicationType')
        data_list.append({
            'PMID': pmid,
            'journal': citation.findtext('Article/Journal/ISOAbbreviation') or '',
            'affiliations_info': ';'.join(_affiliations(citation)),
            'pub_year': _pub_year(citation),
            'mesh_words': ';'.join(_mesh_terms(citation)),
            'country': citation.findtext('MedlineJournalInfo/Country') or '',
            'article_type': '/'.join(node.text or '' for node in publication_types),
            'file_path': path,
        })
        print(pmid + '\t解析完成')

    # Pin the column order: a DataFrame built from dicts otherwise follows
    # insertion order, which does not match the CSV header.
    df = pd.DataFrame(data_list, columns=CSV_COLUMNS)
    with _CSV_LOCK:
        df.to_csv('{}.csv'.format(dir_name), encoding='utf_8_sig', mode='a', index=False, header=False)


def to_excel(data, path):
    """Write ``data`` (a DataFrame) to sheet 'table' of the workbook at ``path``."""
    # ExcelWriter.save() was removed in pandas 2.0; the context manager saves on exit.
    with pd.ExcelWriter(path) as writer:
        data.to_excel(writer, sheet_name='table', index=False)


def get_files_path():
    """Return the XML files under ``SOURCE_DIRS`` that are not yet in any CSV.

    CSV files that do not exist yet are created containing only the header.

    :return: sorted list of file paths still to be parsed (empty when done).
    """
    # Accumulate over every directory os.walk visits instead of overwriting
    # the list on each iteration.
    file_list = [
        os.path.join(base_path, name)
        for source_dir in SOURCE_DIRS
        for base_path, _folders, names in os.walk(source_dir)
        for name in names
        if name.endswith('.xml')
    ]

    parsed_per_csv = []
    for source_dir in SOURCE_DIRS:
        csv_path = '{}.csv'.format(source_dir)
        if os.path.exists(csv_path):
            # utf_8_sig mirrors the encoding used for writing, so the BOM is stripped.
            done = pd.read_csv(csv_path, encoding='utf_8_sig')
            parsed_per_csv.append(done.file_path.dropna().tolist())
        else:
            pd.DataFrame(columns=CSV_COLUMNS).to_csv(csv_path, encoding='utf_8_sig', index=False)
    has_files = set(chain.from_iterable(parsed_per_csv))

    print('共需解析文件:{0}'.format(len(file_list)))
    print('已解析文件:{0}'.format(len(has_files)))
    return sorted(set(file_list) - has_files)


if __name__ == '__main__':
    files_list = get_files_path()
    if not files_list:
        print('全部解析完成')
    else:
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
            futures = {pool.submit(pubmed_xml_parser, file_path): file_path for file_path in files_list}
            # Collect every result so a failure in one file is reported
            # instead of being silently discarded by the executor.
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as exc:
                    print('{0}\t解析失败:{1}'.format(futures[future], exc))
方法二:lxml+xpath
# -*- coding: utf-8 -*-
"""
@Datetime: 2019/4/26
@Author: Zhang Yafei

Extract per-article metadata from PubMed XML exports with lxml + XPath and
append it to ``pubmed.csv``.
"""
import os
import re
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

from lxml import etree
import pandas as pd

# Top-level folders scanned for ``*.xml`` files.
SOURCE_DIRS = ('first in class drug', 'follow on drug', 'me too drug')

# Output file shared by every worker.
CSV_PATH = 'pubmed.csv'

# Single source of truth for the CSV layout.  Rows are appended without a
# header, so the header written by get_files_path() and every appended row
# must use exactly this column order.
CSV_COLUMNS = ['PMID', 'affiliations_info', 'article_type', 'country',
               'file_path', 'journal', 'mesh_words', 'pub_year']

# One lock shared by every worker thread.  (A lock created inside the worker,
# e.g. ``with threading.Lock():``, is private to that call and protects nothing.)
_CSV_LOCK = threading.Lock()


def _xpath_string(node, expression):
    """Return the string value of the first node matched by ``expression``, or ''."""
    # XPath string() yields '' for an empty node-set, so a missing element
    # cannot raise IndexError the way ``xpath(...)[0]`` does.
    return str(node.xpath('string({})'.format(expression)))


def _pub_year(article):
    """Return the publication year as a string, or '' when none can be found."""
    pub_date = 'MedlineCitation/Article/Journal/JournalIssue/PubDate/'
    year = _xpath_string(article, pub_date + 'Year')
    if year:
        return year
    # Records without <Year> carry a free-text <MedlineDate> such as
    # "2017 Sep-Oct"; take its first run of digits.
    match = re.search(r'\d+', _xpath_string(article, pub_date + 'MedlineDate'))
    return match.group(0) if match else ''


def _mesh_terms(article):
    """Return MeSH terms as 'Descriptor' or 'Descriptor/Qualifier' strings."""
    terms = []
    for heading in article.xpath('MedlineCitation/MeshHeadingList/MeshHeading'):
        children = heading.xpath('child::*')
        if len(children) == 1:
            # Descriptor without qualifiers: emit the descriptor itself.
            name = str(children[0].xpath('string()'))
            if name:
                terms.append(name)
            continue
        descriptor = ''
        for child in children:
            if child.tag == 'DescriptorName':
                descriptor = str(child.xpath('string()'))
            elif descriptor and child.tag == 'QualifierName':
                terms.append(descriptor + '/' + str(child.xpath('string()')))
    return terms


def pubmed_xpath_parse(path):
    """Parse one PubMed XML file and append its articles to ``pubmed.csv``.

    :param path: path of the XML file to parse.
    :return: None.  One row per distinct PMID is appended (no header) in
        ``CSV_COLUMNS`` order.
    :raises lxml.etree.XMLSyntaxError: if the file is not well-formed XML.
    """
    tree = etree.parse(path)
    # Note carried over from the original author: if the XML declares a DTD
    # that has to be applied while parsing, build a parser for it and pass it
    # to etree.parse (the DTD file must sit next to the XML file):
    # parser = etree.XMLParser(load_dtd=True)
    # tree = etree.parse('1.xml', parser=parser)
    data_list = []
    seen_pmids = set()
    for articles in tree.xpath('//PubmedArticle'):
        pmid = _xpath_string(articles, 'MedlineCitation/PMID')
        if not pmid or pmid in seen_pmids:
            continue
        seen_pmids.add(pmid)

        affiliation_texts = articles.xpath(
            'MedlineCitation/Article/AuthorList/Author/AffiliationInfo/Affiliation/text()')
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        affiliations = dict.fromkeys(str(text) for text in affiliation_texts)
        publication_types = articles.xpath(
            'MedlineCitation/Article/PublicationTypeList/PublicationType/text()')

        data_list.append({
            'PMID': pmid,
            'journal': _xpath_string(articles, 'MedlineCitation/Article/Journal/ISOAbbreviation'),
            'affiliations_info': ';'.join(affiliations),
            'pub_year': _pub_year(articles),
            'mesh_words': ';'.join(_mesh_terms(articles)),
            'country': _xpath_string(articles, 'MedlineCitation/MedlineJournalInfo/Country'),
            'article_type': '/'.join(str(text) for text in publication_types),
            'file_path': path,
        })
        print(pmid + '\t解析完成')

    # Pin the column order: a DataFrame built from dicts otherwise follows
    # insertion order, which does not match the CSV header.
    df = pd.DataFrame(data_list, columns=CSV_COLUMNS)
    with _CSV_LOCK:
        df.to_csv(CSV_PATH, encoding='utf_8_sig', mode='a', index=False, header=False)


def to_excel(data, path):
    """Write ``data`` (a DataFrame) to sheet 'table' of the workbook at ``path``."""
    # ExcelWriter.save() was removed in pandas 2.0; the context manager saves on exit.
    with pd.ExcelWriter(path) as writer:
        data.to_excel(writer, sheet_name='table', index=False)


def get_files_path():
    """Return the XML files under ``SOURCE_DIRS`` not yet listed in ``pubmed.csv``.

    When ``pubmed.csv`` does not exist it is created containing only the header.

    :return: sorted list of file paths still to be parsed (empty when done).
    """
    # Accumulate over every directory os.walk visits instead of overwriting
    # the list on each iteration.
    file_list = [
        os.path.join(base_path, name)
        for source_dir in SOURCE_DIRS
        for base_path, _folders, names in os.walk(source_dir)
        for name in names
        if name.endswith('.xml')
    ]

    if os.path.exists(CSV_PATH):
        # utf_8_sig mirrors the encoding used for writing, so the BOM is stripped.
        has_files = set(pd.read_csv(CSV_PATH, encoding='utf_8_sig').file_path.dropna())
    else:
        pd.DataFrame(columns=CSV_COLUMNS).to_csv(CSV_PATH, encoding='utf_8_sig', index=False)
        has_files = set()

    print('共需解析文件:{0}'.format(len(file_list)))
    print('已解析文件:{0}'.format(len(has_files)))
    return sorted(set(file_list) - has_files)


if __name__ == '__main__':
    files_list = get_files_path()
    if not files_list:
        print('全部解析完成')
    else:
        # The context manager shuts the pool down once all files are processed.
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
            futures = {pool.submit(pubmed_xpath_parse, file_path): file_path for file_path in files_list}
            # Collect every result so a failure in one file is reported
            # instead of being silently discarded by the executor.
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as exc:
                    print('{0}\t解析失败:{1}'.format(futures[future], exc))