python: xmlhelper

 

https://github.com/tesseract-ocr/tesseract
Tesseract引擎和中文包 (这是HP实验室最早开发的OCR)
https://pan.baidu.com/share/init?surl=XpeRVgiPTU7mmiMiyaXThg
pyth
https://digi.bib.uni-mannheim.de/tesseract/


https://github.com/ViewFaceCore/ViewFaceCore

pip install beautifulsoup4

 

xml:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
<?xml version="1.0"?>
<data>
    <country name="Liechtenstein">
        <rank>1</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor name="Austria" direction="E"/>
        <neighbor name="Switzerland" direction="W"/>
    </country>
    <country name="Singapore">
        <rank>4</rank>
        <year>2011</year>
        <gdppc>59900</gdppc>
        <neighbor name="Malaysia" direction="N"/>
    </country>
    <country name="Panama">
        <rank>68</rank>
        <year>2011</year>
        <gdppc>13600</gdppc>
        <neighbor name="Costa Rica" direction="W"/>
        <neighbor name="Colombia" direction="E"/>
    </country>
</data>

  

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# encoding: utf-8
# 版权所有 2023 涂聚文有限公司
# 许可信息查看:
# 描述: pip install beautifulsoup4
# Author    : geovindu,Geovin Du 涂聚文.
# IDE       : PyCharm 2023.1 python 311
# Datetime  : 2023/7/16 22:17
# User      : geovindu
# Product   : PyCharm
# Project   : pythonTkinterDemo
# File      : XmlHelper.py
# explain   : 学习
 
 
from xml.dom import minidom
import xml.etree.ElementTree as ET
import csv
import requests
import os
import sys
 
def readXml(url):
    """Print the tag and attributes of every direct child of the XML root.

    Args:
        url: Source handed straight to ``ET.parse`` (a file name or an
            open file object).
    """
    # Only the first level below the root is visited; nothing is returned.
    for element in ET.parse(url).getroot():
        print(element.tag, element.attrib)
 
 
def writeXml(url):
    """Build a small sample movie document and write it to *url*.

    The tree is ``<root>`` -> ``<movie shelf="New Arrivals">`` ->
    ``<type>War, Thriller</type>``. Any exception raised while opening or
    writing the file is caught and printed instead of being propagated.

    Args:
        url: Path of the file to create or overwrite.
    """
    document = minidom.Document()

    # An XML document needs exactly one root element attached to it.
    root_element = document.createElement('root')
    document.appendChild(root_element)

    # First-level child carrying its data as an attribute.
    movie = document.createElement('movie')
    movie.setAttribute('shelf', 'New Arrivals')
    root_element.appendChild(movie)

    # Second-level child; in the DOM its text content is itself a child node.
    genre = document.createElement('type')
    genre.appendChild(document.createTextNode("War, Thriller"))
    movie.appendChild(genre)

    try:
        with open(url, 'w', encoding='UTF-8') as out:
            # The first argument is the target file object.
            document.writexml(out, indent='', addindent='\t', newl='\n', encoding='UTF-8')
    except Exception as e:
        print('错误:', e)
 
 
def loadRSS(url='http://www.hindustantimes.com/rss/topnews/rssfeed.xml',
            filename='topnewsfeed.xml', timeout=30):
    """Download an RSS feed and save the raw response body to a file.

    Calling ``loadRSS()`` with no arguments fetches the same feed and
    writes the same file as before; the parameters only make the source,
    destination and timeout configurable.

    Args:
        url: Address of the RSS feed to fetch.
        filename: Path of the file that receives the response bytes.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Raises:
        requests.RequestException: If the request fails, times out, or
            the server answers with an HTTP error status.
    """
    # creating HTTP response object from given url
    resp = requests.get(url, timeout=timeout)

    # Fail here rather than saving an HTTP error page as if it were the feed.
    resp.raise_for_status()

    # saving the xml file; binary mode keeps the server's bytes untouched
    with open(filename, 'wb') as f:
        f.write(resp.content)
 
 
def parseXML(xmlfile):
    """Parse an RSS file into a list of dictionaries, one per ``<item>``.

    Each dictionary maps the tag of every child of the item to that
    child's text. The namespaced ``media:content`` element is stored under
    the key ``'media'`` with the value of its ``url`` attribute.

    Values are ``str`` rather than UTF-8 ``bytes``, so the ``csv`` module
    writes them as plain text instead of ``b'...'`` representations.
    Empty elements and a missing ``url`` attribute yield ``''`` instead of
    raising.

    Args:
        xmlfile: File name or file object handed to ``ET.parse``.

    Returns:
        A list of dictionaries in document order; empty when the file has
        no ``channel/item`` elements.
    """
    # Fully qualified tag of <media:content> in the Yahoo Media RSS namespace.
    media_tag = '{http://search.yahoo.com/mrss/}content'

    root = ET.parse(xmlfile).getroot()

    newsitems = []
    for item in root.findall('./channel/item'):
        news = {}
        for child in item:
            if child.tag == media_tag:
                # The media link lives in an attribute, not in the text.
                news['media'] = child.get('url', '')
            else:
                # child.text is None for elements without text content.
                news[child.tag] = child.text or ''
        newsitems.append(news)

    return newsitems
 
 
def savetoCSV(newsitems, filename):
    """Write news item dictionaries to a CSV file with a fixed column set.

    The columns are ``guid``, ``title``, ``pubDate``, ``description``,
    ``link`` and ``media``. Keys missing from an item are written as empty
    cells, and keys outside the column set are ignored rather than raising
    ``ValueError``. ``bytes`` values are decoded as UTF-8 so they are
    written as text and not as ``b'...'``.

    Args:
        newsitems: Iterable of dictionaries, one per news item.
        filename: Path of the CSV file to create or overwrite.
    """
    # specifying the fields for csv file
    fields = ['guid', 'title', 'pubDate', 'description', 'link', 'media']

    # Decode lazily so the caller's dictionaries are left unmodified.
    rows = (
        {
            key: value.decode('utf-8') if isinstance(value, bytes) else value
            for key, value in item.items()
        }
        for item in newsitems
    )

    # newline='' lets the csv module control line endings (otherwise every
    # row is followed by a blank line on Windows); the explicit encoding
    # keeps non-ASCII text independent of the platform default.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields, extrasaction='ignore')

        # writing headers (field names)
        writer.writeheader()

        # writing data rows
        writer.writerows(rows)
 
 
def main():
    """Fetch the RSS feed, parse the saved XML and export the items to CSV."""
    feed_path = 'topnewsfeed.xml'
    csv_path = 'geovindu.csv'

    # Refresh the local XML copy from the web before parsing it.
    loadRSS()

    # Parse the saved feed and store its items in the CSV file.
    savetoCSV(parseXML(feed_path), csv_path)

  

posted @   ®Geovin Du Dream Park™  阅读(8)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 25岁的心里话
· 闲置电脑爆改个人服务器(超详细) #公网映射 #Vmware虚拟网络编辑器
· 基于 Docker 搭建 FRP 内网穿透开源项目(很简单哒)
· 零经验选手,Compose 一天开发一款小游戏!
· 一起来玩mcp_server_sqlite,让AI帮你做增删改查!!
历史上的今天:
2015-07-16 learning sql (second edition) script
2013-07-16 javascript: iframe switchSysBar 左欄打開關閉,兼容各瀏覽器操作
2013-07-16 SQL:exec sp_executesql 用法
< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5
点击右上角即可分享
微信分享提示