用python从网页下载单词库

从网站下载单词库

1 每一页有几百个单词
2 每一个单词有独立的URL,该URL对应的页面中包含单词的中文解释
3 使用的库:requests、pyquery、web.py(即代码中的 import web)

#coding:utf-8

import requests as rq
from pyquery import PyQuery as pq
import web
import threading

db = web.database(dbn="sqlite",db="gre.db")


def initDB():
    """Create the ``gre_word`` table in the module-level database if absent.

    The table has an integer primary key ``id`` and two ``VARCHAR(200)``
    columns, ``word`` and ``meaning``. Thanks to ``IF NOT EXISTS`` the
    statement can be issued repeatedly.
    """
    db.query("""
        CREATE TABLE IF NOT EXISTS gre_word(
            id INTEGER PRIMARY KEY,
            word VARCHAR(200),
            meaning VARCHAR(200)
        )
    """)

def get_html(url, timeout=10):
    """Download *url* and return the raw response body.

    A Chrome-on-Windows ``User-Agent`` header is sent with the request.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises. Without a timeout a single stalled connection would
            block the whole crawl indefinitely.

    Returns:
        The undecoded response content as ``bytes``. The HTTP status code is
        not checked, so error pages are returned like any other body.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
    }
    response = rq.get(url=url, headers=headers, timeout=timeout)
    return response.content

def get_words(html):
    """Extract the word list from a listing page.

    Args:
        html: Markup of a listing page, in any form ``PyQuery`` accepts.

    Returns:
        A list of ``(word, word_url)`` tuples, one for each
        ``.word-box a.word`` anchor that carries an ``href``. Each link is
        resolved against the site root, so root-relative, relative and
        absolute hrefs all yield a usable URL.
    """
    from urllib.parse import urljoin

    url_pre = "https://www.koolearn.com"
    doc = pq(html)
    words = []
    for anchor in doc(".word-box a.word").items():
        href = anchor.attr("href")
        if not href:
            # An anchor without a link has no detail page to fetch; formatting
            # the missing value would produce a bogus "...comNone" URL.
            continue
        words.append((anchor.text(), urljoin(url_pre, href)))
    return words

def insert_words(word,meaning):
    """Store one word/meaning pair as a new row in the ``gre_word`` table.

    A confirmation line is printed once the insert call has returned.
    """
    row = {"word": word, "meaning": meaning}
    db.insert("gre_word", **row)
    print("insert ok!")
        
def get_word_meaning(wordURL):
    """Fetch a word's detail page and return its meaning text.

    Args:
        wordURL: Address of the word's detail page.

    Returns:
        The text of the page's ``li.clearfix`` elements, or ``""`` when
        extracting that text fails.

    Errors raised while downloading or parsing the page are not handled here
    and propagate to the caller.
    """
    doc = pq(get_html(wordURL))
    try:
        meaning = doc("li.clearfix").text()
    except Exception:
        # Best effort: a page whose text cannot be extracted yields an empty
        # meaning. Catching Exception rather than using a bare except keeps
        # KeyboardInterrupt / SystemExit working during a long crawl.
        meaning = ""
    return meaning
    
if __name__ == "__main__":
    # Crawl listing pages 1-30, look up every word's meaning and store the
    # pairs in the gre_word table, then print the elapsed time in seconds.
    import time

    url = "https://www.koolearn.com/dict/tag_921_{}.html"
    startT = time.time()
    # Create the table first: on a fresh gre.db it does not exist yet and the
    # DELETE below would fail.
    initDB()
    # Start from an empty table so re-running the script does not duplicate rows.
    db.delete("gre_word",where="1")
    for i in range(1,31):
        html_ = get_html(url.format(i))
        words = get_words(html_)
        # One transaction per listing page: all of a page's words are
        # committed together when the block exits.
        with db.transaction():
            for j in words:
                word,meaning = j[0],get_word_meaning(j[1])
                insert_words(word,meaning)
    endT = time.time()
    print(endT - startT)

注:代码中虽然导入了 threading,但没有使用线程,所有请求都是串行执行的。

posted @ 2022-09-21 00:14  daivlin  阅读(77)  评论(0)  编辑  收藏  举报