Python 识别文档中的英语单词后查找翻译并保存为 xlsx

代码:

# -*- coding: utf-8 -*-
"""
Created on Fri Aug  5 17:11:50 2022

@author: koneko
"""
import requests
import docx
import re
import sqlite3
import openpyxl

   

def translate(keyword, timeout=10):
    """Look up *keyword* through the Baidu Fanyi suggestion endpoint.

    Args:
        keyword: The text to look up; sent as the ``kw`` form field.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error. Without it a stalled connection would block
            the whole script indefinitely.

    Returns:
        The ``'v'`` field of the first entry in the response's ``'data'``
        list, or an empty list ``[]`` when the response carries no
        suggestions (callers compare against ``[]`` to detect a miss).

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    url = 'https://fanyi.baidu.com/sug'
    response = requests.post(url, data={'kw': keyword}, timeout=timeout)
    # A missing 'data' key is treated the same as an empty suggestion list
    # instead of surfacing as a KeyError.
    suggestions = response.json().get('data')
    if not suggestions:
        return []
    return suggestions[0]['v']


def lang_detect(keyword, timeout=10):
    """Ask the Baidu Fanyi language-detection endpoint about *keyword*.

    Args:
        keyword: The text to classify; sent as the ``query`` form field.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error, so a stalled connection cannot hang the script.

    Returns:
        The value of the ``'lan'`` field of the JSON response (callers in
        this file compare it against ``'en'``).

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
        KeyError: If the response has no ``'lan'`` field.
    """
    url = 'https://fanyi.baidu.com/langdetect'
    response = requests.post(url, data={'query': keyword}, timeout=timeout)
    return response.json()['lan']


def load_docx_and_get_words(fileName, limit=10):
    """Extract the distinct lower-cased ASCII-letter words from a .docx file.

    Args:
        fileName: Path of the document, passed to ``docx.Document``.
        limit: Maximum number of word occurrences (counted before
            de-duplication) to keep from the start of the document.
            Defaults to 10, matching the previous hard-coded cap; pass
            ``None`` to process the whole document.

    Returns:
        A list of unique words in order of first appearance.
    """
    doc = docx.Document(fileName)

    # Join with a newline: concatenating paragraph texts directly would fuse
    # the last word of one paragraph with the first word of the next.
    text = '\n'.join(paragraph.text for paragraph in doc.paragraphs).lower()

    words = re.findall(r'[A-Za-z]+', text)
    if limit is not None:
        words = words[:limit]

    # dict.fromkeys de-duplicates while keeping a deterministic order,
    # unlike set(), whose iteration order can differ between runs.
    words = list(dict.fromkeys(words))
    print('总共解析出'+str(len(words))+'个单词')
    return words
    

def words_filter(words):
    """Drop words that are too short or not detected as English.

    A word is removed when its length is 2 or less, or when
    ``lang_detect`` returns anything other than ``'en'`` for it.

    Args:
        words: List of candidate words. It is updated in place.

    Returns:
        The same list object, now containing only the surviving words.
    """
    kept = []
    # Collect survivors in a separate list: removing from `words` while
    # iterating over it shifts the remaining items and makes the loop skip
    # the element that follows every removal.
    for i, word in enumerate(words):
        print(i, word)
        if len(word) <= 2:
            print('remove '+ word +' for length <= 2')
            continue
        if lang_detect(word) != 'en':
            print('remove '+ word + ' for not english' )
            continue
        kept.append(word)

    # Preserve the original in-place contract for callers holding `words`.
    words[:] = kept
    print('清理后共'+str(len(words))+'个单词')
    return words



def words_to_dictionary(words):
    """Build a word -> translation mapping, ordered alphabetically by word.

    Each word is looked up with ``translate``; an empty-list result is
    retried up to three more times. Words that still have no translation
    are reported and left out.

    Args:
        words: Iterable of words to translate.

    Returns:
        A dict whose keys are inserted in ascending key order.
    """
    found = {}
    for word in words:
        # One initial attempt plus at most three retries.
        for _attempt in range(4):
            trans = translate(word)
            if trans != []:
                break

        if trans != []:
            print(word)
            print(trans)
            found[word] = trans
        else:
            print(word,'找不到翻译')

    # Re-insert the entries in sorted key order.
    return {key: found[key] for key in sorted(found)}


def save_to_xlsx(fileName, dictionary):
    """Write a mapping to ``<fileName>.xlsx`` as two columns.

    The active sheet is titled ``vocabulary``; each entry occupies one row,
    with the key in column 1 and the value in column 2, starting at row 1.

    Args:
        fileName: Output path without the ``.xlsx`` extension.
        dictionary: Mapping whose items are written in iteration order.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = 'vocabulary'

    for row_number, (key, value) in enumerate(dictionary.items(), start=1):
        sheet.cell(row=row_number, column=1, value=key)
        sheet.cell(row=row_number, column=2, value=value)

    workbook.save(fileName + '.xlsx')


# Script entry point: extract words from 'cet4-1.docx', filter them,
# look up their translations, and write the result to 'myVocabulary.xlsx'
# (save_to_xlsx appends the '.xlsx' extension).
words = load_docx_and_get_words('cet4-1.docx')
words = words_filter(words,)
dictionary = words_to_dictionary(words)

save_to_xlsx('myVocabulary', dictionary) 




    
posted @ 2022-08-05 17:31  裏表異体  阅读(65)  评论(0)  编辑  收藏  举报