用python从网页下载单词库
从网站下载单词库
1 每一页有几百个单词
2 每一个单词有独立的URL,URL中包含单词的中文解释
3 使用的库 requests,pyquery,web
#coding:utf-8
import requests as rq
from pyquery import PyQuery as pq
import web
import threading
db = web.database(dbn="sqlite",db="gre.db")
def initDB():
_initSQL = """
CREATE TABLE IF NOT EXISTS gre_word(
id INTEGER PRIMARY KEY,
word VARCHAR(200),
meaning VARCHAR(200)
)
"""
db.query(_initSQL)
def get_html(url):
"""获取html文档"""
headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
}
response = rq.get(url=url,headers=headers)
return response.content
def get_words(html):
"""
获取单词表,返回值:[(单词,单词链接)]
"""
url_pre = "https://www.koolearn.com"
word = pq(html)
words = [(i.text(),"{}{}".format(url_pre,i.attr("href"))) for i in word(".word-box a.word").items()]
return words
def insert_words(word,meaning):
"""插入单词"""
db.insert(
"gre_word",
word = word,
meaning = meaning,
)
print("insert ok!")
def get_word_meaning(wordURL):
"""获取单词含义"""
doc = pq(get_html(wordURL))
try:
meaning = doc("li.clearfix").text()
except:
meaning = ""
return meaning
if __name__ == "__main__":
url = "https://www.koolearn.com/dict/tag_921_{}.html"
import time
startT = time.time()
db.delete("gre_word",where="1")
for i in range(1,31):
html_ = get_html(url.format(i))
words = get_words(html_)
with db.transaction():
for j in words:
word,meaning = j[0],get_word_meaning(j[1])
insert_words(word,meaning)
endT = time.time()
print(endT - startT)
没有使用线程。