# Code:
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 5 17:11:50 2022
@author: koneko
"""
import requests
import docx
import re
import sqlite3
import openpyxl
def translate(keyword, timeout=10):
    """Look up ``keyword`` through the fanyi.baidu.com suggestion endpoint.

    POSTs ``{'kw': keyword}`` to ``https://fanyi.baidu.com/sug`` and reads
    the JSON reply.

    Args:
        keyword: The text to look up.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error. Without a timeout a stalled connection would
            block the caller indefinitely.

    Returns:
        The ``'v'`` field of the first entry of the reply's ``'data'`` list,
        or ``[]`` when ``'data'`` is empty or missing.
    """
    url = 'https://fanyi.baidu.com/sug'
    payload = {'kw': keyword}
    response = requests.post(url, payload, timeout=timeout)
    suggestions = response.json().get('data')
    # The empty list is the "nothing found" sentinel: code elsewhere in this
    # file compares the result with [], so a reply lacking 'data' is
    # reported the same way instead of raising KeyError.
    if not suggestions:
        return []
    return suggestions[0]['v']
def lang_detect(keyword, timeout=10):
    """Ask the fanyi.baidu.com language-detection endpoint about ``keyword``.

    POSTs ``{'query': keyword}`` to ``https://fanyi.baidu.com/langdetect``.

    Args:
        keyword: The text whose language should be detected.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error. Without a timeout a stalled connection would
            block the caller indefinitely.

    Returns:
        The value of the ``'lan'`` field of the JSON reply.

    Raises:
        KeyError: If the reply has no ``'lan'`` field.
    """
    url = 'https://fanyi.baidu.com/langdetect'
    payload = {'query': keyword}
    response = requests.post(url, payload, timeout=timeout)
    return response.json()['lan']
def load_docx_and_get_words(fileName, max_words=10):
    """Extract the distinct lower-cased ASCII words from a .docx file.

    Args:
        fileName: Path of the .docx document to read.
        max_words: Only the first ``max_words`` regex matches are
            considered (before duplicates are dropped). Pass ``None`` to
            use every match. The default keeps the original cap of 10.

    Returns:
        A list of unique words, each a run of ASCII letters, in order of
        first appearance (on Python 3.7+, where dicts keep insertion order).
    """
    doc = docx.Document(fileName)
    # Join paragraphs with a newline: gluing them together directly would
    # fuse the last word of one paragraph with the first word of the next.
    text = '\n'.join(paragraph.text for paragraph in doc.paragraphs).lower()
    matches = re.findall(r'[A-Za-z]+', text)
    if max_words is not None:
        matches = matches[:max_words]
    # dict.fromkeys removes duplicates while keeping a stable order.
    words = list(dict.fromkeys(matches))
    print('总共解析出' + str(len(words)) + '个单词')
    return words
def words_filter(words):
    """Drop words that are too short or not detected as English.

    A word is removed when its length is 2 or less, or when
    ``lang_detect(word)`` returns anything other than ``'en'``. Progress
    and every removal are printed.

    Args:
        words: List of candidate words. It is updated in place.

    Returns:
        The same list object, now holding only the words that were kept.
    """
    kept = []
    # Collect survivors in a separate list: removing items from ``words``
    # while looping over it makes the loop skip the element that follows
    # each removal, so some words would never be checked.
    for i, word in enumerate(words):
        print(i, word)
        if len(word) <= 2:
            print('remove ' + word + ' for length <= 2')
            continue
        if lang_detect(word) != 'en':
            print('remove ' + word + ' for not english')
            continue
        kept.append(word)
    # Slice-assign so that callers holding the original list see the result.
    words[:] = kept
    print('清理后共' + str(len(words)) + '个单词')
    return words
def words_to_dictionary(words):
    """Translate each word and return a word -> translation mapping.

    Every word is passed to ``translate``. A result equal to ``[]`` counts
    as a failed lookup and is retried; after four failed attempts in total
    the word is reported and left out.

    Args:
        words: Iterable of words to translate.

    Returns:
        A dict of the translated words, with keys inserted in sorted order.
    """
    translations = {}
    for word in words:
        # One initial lookup plus up to three retries while the result is [].
        for _ in range(4):
            meaning = translate(word)
            if meaning != []:
                break
        else:
            # Every attempt came back empty.
            print(word, '找不到翻译')
            continue
        print(word)
        print(meaning)
        translations[word] = meaning
    # Rebuild the mapping so that its keys are in alphabetical order.
    return {key: translations[key] for key in sorted(translations)}
def save_to_xlsx(fileName, dictionary):
    """Write a mapping to a new Excel workbook, one entry per row.

    Keys go to column 1 and values to column 2 of a sheet titled
    ``'vocabulary'``, starting at row 1.

    Args:
        fileName: Output path without extension; ``'.xlsx'`` is appended.
        dictionary: Mapping whose items are written in iteration order.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = 'vocabulary'
    # Worksheet rows are 1-based, so start counting at 1.
    for row_number, (word, meaning) in enumerate(dictionary.items(), start=1):
        sheet.cell(row=row_number, column=1).value = word
        sheet.cell(row=row_number, column=2).value = meaning
    workbook.save(fileName + '.xlsx')
# Script pipeline: read the words from the document, filter them,
# translate what is left, then write the vocabulary to a spreadsheet.
words = words_filter(load_docx_and_get_words('cet4-1.docx'))
dictionary = words_to_dictionary(words)
save_to_xlsx(fileName='myVocabulary', dictionary=dictionary)