I. Requirements
- Add a new word book category (three levels deep)
- Fetch word definitions through a new API
- Add a bgColor property to each word
II. Requirements Breakdown
2.1 Word book categories
2.2 Fetch word definitions through the new API
Step 1. Get the complete word list
Step 2. Clean up two kinds of words
2.2.1 What needs cleaning
- Words that have no example or definition in the current dictionary
- Words the current dictionary does not have at all
2.2.2 Steps
- Collect all the words in the new word books
- Filter out the words in the current dictionary that lack examples and definitions
- Deduplicate with a set
- Crawl every word that needs to be added, e.g. https://api.dictionaryapi.dev/api/v2/entries/en/hello (see the sketch below)
- Crawl the phonetic files and download the audio from oxfordlearnersdictionaries
- Translate the examples with the Baidu Translate API
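Each successful lookup returns a JSON array of entries; every entry groups its senses under meanings by partOfSpeech, and a given definition may or may not carry an example. A minimal sketch of reading those fields (the fetch_entry helper is mine; field names follow the API's public response shape, and error handling is kept minimal):

import requests

def fetch_entry(word):
    # One word can map to several entries; each entry groups definitions by part of speech.
    resp = requests.get("https://api.dictionaryapi.dev/api/v2/entries/en/{0}".format(word))
    resp.raise_for_status()
    for entry in resp.json():
        for meaning in entry["meanings"]:
            pos = meaning["partOfSpeech"]
            for d in meaning["definitions"]:
                # "example" is only present on some definitions.
                print(pos, "|", d["definition"], "|", d.get("example", ""))

fetch_entry("hello")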
2.2.3 Pitfalls
- Cache inconsistency
- Run the phrases through the Baidu API once to get concise definitions
- The audio resources also need to be regenerated
- Which parts to keep (see the sketch after this list):
  - Keep only the definitions that have examples
  - If none of them has an example, guarantee at least one definition per part of speech
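A sketch of one reading of those retention rules, applied per part of speech to a crawled entry (prune_meanings is a hypothetical helper; field names match the API response shown in 2.2.2):

def prune_meanings(entry):
    # Keep only example-bearing definitions; if a part of speech has none,
    # keep its first definition so that no part of speech is dropped entirely.
    pruned = []
    for meaning in entry["meanings"]:
        defs = meaning["definitions"]
        with_example = [d for d in defs if d.get("example")]
        kept = with_example if with_example else defs[:1]
        pruned.append({"partOfSpeech": meaning["partOfSpeech"], "definitions": kept})
    return pruned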
2.3 Add a bgColor property to each word
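One possible way to fill the new property, as a hypothetical sketch only: hash each word into a fixed palette so its color is stable across runs. The palette, the hashing idea, and the bg_color helper are all assumptions, not the final design.

import hashlib

# Placeholder palette; the real colors would come from the design spec.
PALETTE = ["#F6C344", "#5B8FF9", "#61DDAA", "#F08BB4", "#9661BC"]

def bg_color(word):
    # md5 keeps the mapping deterministic, so a word always gets the same color.
    digest = hashlib.md5(word.encode("utf-8")).hexdigest()
    return PALETTE[int(digest, 16) % len(PALETTE)]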
III. Code
1. Get the wordList of the new word books
Output goes to newBookAllWordList.json:
[
  "hold-up",
  "overheating",
  "answerphone",
  "syncretize",
  "pedantry",
  "poohed",
  "national highway",
  "overstate",
  "scenery",
  "evenly spaced"
]
# Gather every word from the per-category CSV files.
import os
import csv
import json

def main():
    allWordList = []
    path = '.'
    files = os.listdir(path)
    # Walk each sub-folder and read every CSV inside it.
    for file in files:
        if os.path.isdir(file):
            subfiles = os.listdir(file)
            for subfile in subfiles:
                if subfile.endswith('.csv'):
                    with open(file + "/" + subfile, mode='r') as f:
                        reader = csv.reader(f)
                        header = next(reader)  # skip the header row
                        # Read the remaining rows one by one.
                        for row in reader:
                            # row[0] is the word itself.
                            allWordList.append(row[0])
    # Deduplicate before writing out.
    allWordList = list(set(allWordList))
    with open("newBookAllWordList.json", mode="w", encoding="utf-8") as f:
        json.dump(allWordList, f, ensure_ascii=False, indent=2)

if __name__ == '__main__':
    main()
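Run this from the directory that holds the per-category CSV folders: it reads the first column of every CSV, deduplicates across all files, and writes the merged list.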
2. Get the list of words in the current database that have no definition or example
Rules:
- Words that are new in the new word books
- Words in base_word missing an example
- Words in base_word missing an explain
- Words whose example is "Currently no example."
- Drop phrases (multi-word entries)
Output goes to mergeList.json:
import mysql.connector
import json

# All words currently in the dictionary
wordList = []
# All rows from the explain table
explainList = []
# All rows from the example table
exampleList = []
# All words from the new word books
newBookAllWordList = []
# Words the new books have but the database does not
newBookMissWordList = []
emptyExplainWordList = []
missExplainWordList = []
emptyExampleWordList = []
missExampleWordList = []
# Placeholder stored when a word has no example
exampleEmptyStr = "Currently no example."

def saveAllWordListFromDB():
    conn = mysql.connector.connect(user='root', password='HPyuko12!!', database='a4')
    cursor = conn.cursor()
    cursor.execute('select name from base_word')
    values = cursor.fetchall()
    for value in values:
        wordList.append(value[0])
    with open('dbAllWordList.json', 'w') as f:
        json.dump(wordList, f)

def saveAllExplainListFromDB():
    conn = mysql.connector.connect(user='root', password='HPyuko12!!', database='a4')
    cursor = conn.cursor()
    cursor.execute('select * from `explain`')
    values = cursor.fetchall()
    for value in values:
        explainList.append(value)
    # save json
    with open('dbExplainList.json', 'w') as f:
        json.dump(explainList, f)

def saveAllExampleListFromDB():
    conn = mysql.connector.connect(user='root', password='HPyuko12!!', database='a4')
    cursor = conn.cursor()
    cursor.execute('select * from example')
    values = cursor.fetchall()
    for value in values:
        exampleList.append(value)
    # save json
    with open('dbExampleList.json', 'w') as f:
        json.dump(exampleList, f)

def checkEmptyExample():
    # load json
    with open('dbAllWordList.json', 'r') as f:
        wordList = json.load(f)
    with open('dbExampleList.json', 'r') as f:
        exampleList = json.load(f)
    # Collect words whose example is only the placeholder string.
    for example in exampleList:
        if example[2] == exampleEmptyStr:
            emptyExampleWordList.append(example[1])
    with open('dbEmptyExampleWordList.json', 'w') as f:
        json.dump(emptyExampleWordList, f)

# Words present in wordList but absent from exampleList
def checkMissExample():
    with open('dbAllWordList.json', 'r') as f:
        wordList = json.load(f)
    with open('dbExampleList.json', 'r') as f:
        exampleList = json.load(f)
    exampleWordList = []
    for example in exampleList:
        exampleWordList.append(example[1])
    # check
    for word in wordList:
        if word not in exampleWordList:
            missExampleWordList.append(word)
    with open('dbMissExampleWordList.json', 'w') as f:
        json.dump(missExampleWordList, f)

def checkMissExplain():
    with open('dbAllWordList.json', 'r') as f:
        wordList = json.load(f)
    with open('dbExplainList.json', 'r') as f:
        explainList = json.load(f)
    explainWordList = []
    for explain in explainList:
        explainWordList.append(explain[1])
    # check
    for word in wordList:
        if word not in explainWordList:
            missExplainWordList.append(word)
    with open('dbMissExplainWordList.json', 'w') as f:
        json.dump(missExplainWordList, f)

def checkNewWord():
    with open('dbAllWordList.json', 'r') as f:
        wordList = json.load(f)
    with open('newBookAllWordList.json', 'r') as f:
        newBookAllWordList = json.load(f)
    for word in newBookAllWordList:
        if word not in wordList:
            newBookMissWordList.append(word)
    with open('newBookMissWordList.json', 'w') as f:
        json.dump(newBookMissWordList, f)

def mergeAll():
    with open('dbEmptyExampleWordList.json', 'r') as f:
        emptyExampleWordList = json.load(f)
    with open('dbMissExampleWordList.json', 'r') as f:
        missExampleWordList = json.load(f)
    with open('dbMissExplainWordList.json', 'r') as f:
        missExplainWordList = json.load(f)
    with open('newBookMissWordList.json', 'r') as f:
        newBookMissWordList = json.load(f)
    # merge
    mergeList = []
    for word in emptyExampleWordList:
        mergeList.append(word)
    for word in missExampleWordList:
        mergeList.append(word)
    for word in missExplainWordList:
        mergeList.append(word)
    for word in newBookMissWordList:
        mergeList.append(word)
    # Drop phrases: keep single-token words only.
    mergeList = [word for word in mergeList if len(word.split()) == 1]
    with open('mergeList.json', 'w') as f:
        print(len(list(set(mergeList))))
        json.dump(list(set(mergeList)), f)

if __name__ == '__main__':
    # Export the current state from the database
    # saveAllWordListFromDB()
    # saveAllExampleListFromDB()
    # saveAllExplainListFromDB()
    # Check for placeholder examples
    # checkEmptyExample()
    # Check for missing examples
    # checkMissExample()
    # Check for missing explains
    # checkMissExplain()
    # checkNewWord()
    mergeAll()
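The stages in __main__ are run one at a time by uncommenting them: first the three DB export functions, then the individual checks, and finally mergeAll(). Every stage caches its result as a JSON file, so later stages can be rerun without touching the database.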
3. Crawl the mergeList words from the dictionary API and store the raw responses first
- Words that return 404 are tracked in Redis (kept as 404wordList.json)
- Each word's raw JSON response is saved locally as ./data/crawl/{word}.json
import requests
import json
import os
import threadpool
import random
import time
import redis

api_url = "https://api.dictionaryapi.dev/api/v2/entries/en/{0}"
index = 0
total = 0

def lower(word):
    # Accept either a bare word or a "word.json" filename.
    if word.split(".")[-1] == "json":
        return word.split(".")[0].lower()
    else:
        return word.lower()

pool = redis.ConnectionPool(host='localhost', port=6379, decode_responses=True)

def loadAllNeedCrawlWords():
    global total, pool
    r = redis.Redis(connection_pool=pool)
    existWordList = os.listdir("../data/crawl")
    existWordList = list(map(lower, existWordList))
    # Words known to return 404
    notFoundWordList = list(r.smembers("crawl"))
    with open("../data/mergeList.json", "r") as f:
        data = json.load(f)
    # Skip words that were already crawled or known to be missing
    data = list(filter(lambda x: x not in existWordList, data))
    data = list(filter(lambda x: x not in notFoundWordList, data))
    # shuffle
    random.shuffle(data)
    # data to lower case
    wordListLower = list(map(lower, data))
    total = len(wordListLower)
    # multi thread
    threadPool = threadpool.ThreadPool(50)
    reqs = threadpool.makeRequests(crawl, wordListLower)
    for req in reqs:
        threadPool.putRequest(req)
    threadPool.wait()

def crawl(word):
    global index, total, pool
    index += 1
    # progress
    print("{0}/{1}".format(index, total))
    response = requests.get(api_url.format(word))
    if response.status_code == 200:
        with open("../data/crawl/{0}.json".format(word), "w") as f:
            f.write(response.text)
    else:
        if response.status_code == 404:
            # Remember words the dictionary site has no entry for
            r = redis.Redis(connection_pool=pool)
            r.sadd("crawl", word)
            print(word)
        print(response.text)
        print(response.status_code)
        time.sleep(1)

def main():
    loadAllNeedCrawlWords()

if __name__ == "__main__":
    main()
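Because finished words land as files under ../data/crawl/ and 404 words go into the Redis set crawl, the script is resumable: each run filters both groups out of mergeList before spreading the remaining words across the 50-thread pool.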
4. Baidu Translate API
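The web endpoint v2transapi rejects requests that lack a valid token and sign. The script scrapes token and the gtk seed out of the homepage HTML with regexes, then computes sign by running the site's own obfuscated JavaScript (saved locally as translate.js, listed after this script) through execjs.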
import requests
import re
import json
import execjs

def baiduTrans(word):
    token_url = 'https://fanyi.baidu.com'
    post_url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'cache-control': 'no-cache',
        'cookie': '[COOKIE]',
        'pragma': 'no-cache',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
    }
    # Scrape token and the gtk seed from the homepage HTML.
    index_html = requests.get(url=token_url, headers=headers).text
    pattern1 = r"""window\['common'\] = {.*?token:(.*?),"""
    token = re.findall(pattern1, index_html, re.S)[0]
    pattern2 = r"""window.gtk = (.*?);\n"""
    gtk = re.findall(pattern2, index_html, re.S)[0]
    token = token.replace("\'", "").strip()
    gtk = gtk.replace("\"", "").strip()
    # Compute the sign with the site's own JS.
    with open('translate.js', 'r') as f:
        data = f.read()
    ex = execjs.compile(data)
    sign = ex.eval('e("{}","{}")'.format(word, gtk))
    data = {
        'from': 'en',
        'to': 'zh',
        'query': word,
        'transtype': 'translang',
        'simple_means_flag': '3',
        'domain': 'common',
        'sign': sign,
        'token': token
    }
    r_html = requests.post(url=post_url, data=data, headers=headers)
    # Save the raw translation response.
    with open(str(word) + ".json", mode="w") as f:
        transText = r_html.json()
        json.dump(transText, f, ensure_ascii=False)

if __name__ == '__main__':
    baiduTrans("world")
function a(r) {
    if (Array.isArray(r)) {
        for (var o = 0, t = Array(r.length); o < r.length; o++)
            t[o] = r[o];
        return t;
    }
    return Array.from(r);
}
// Mixing helper: applies the shift/xor schedule encoded in string o to r.
function n(r, o) {
    for (var t = 0; t < o.length - 2; t += 3) {
        var a = o.charAt(t + 2);
        a = a >= "a" ? a.charCodeAt(0) - 87 : Number(a);
        a = "+" === o.charAt(t + 1) ? r >>> a : r << a;
        r = "+" === o.charAt(t) ? r + a & 4294967295 : r ^ a;
    }
    return r;
}
// Entry point: computes the sign for query r using the gtk seed.
function e(r, gtk) {
    var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    if (null === o) {
        var t = r.length;
        t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10));
    } else {
        for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)
            "" !== e[C] && f.push.apply(f, a(e[C].split(""))),
            C !== h - 1 && f.push(o[C]);
        var g = f.length;
        g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""));
    }
    var u = void 0
      , l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
    u = null !== i ? i : (i = gtk || "") || "";
    // UTF-8-encode r into the byte array S.
    for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
        var A = r.charCodeAt(v);
        128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
        S[c++] = A >> 18 | 240,
        S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
        S[c++] = A >> 6 & 63 | 128),
        S[c++] = 63 & A | 128);
    }
    // Fold the bytes through the two schedules and derive the sign.
    for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
        p += S[b],
        p = n(p, F);
    return p = n(p, D),
    p ^= s,
    0 > p && (p = (2147483647 & p) + 2147483648),
    p %= 1e6,
    p.toString() + "." + (p ^ m);
}
// Cached gtk seed, set on the first call to e().
var i = null;
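The entry point is e(r, gtk): it truncates long queries, UTF-8-encodes the result, folds the bytes through the two shift/xor schedules, and returns the sign as a string of the form "NNNNNN.NNNNN", which the Python script reads back via ex.eval.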
5. Phonetics and pronunciation
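Pronunciations (phonetic transcriptions plus audio) are fetched through an oxfordlearnersdictionaries scraper; words it cannot resolve are added to the Redis set notPron so that reruns skip them.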
from oxford.oxford import Word
import json
import os
import threadpool
import random
import redis

pool = redis.ConnectionPool(host='localhost', port=6379, decode_responses=True)

def lower(word):
    # Accept either a bare word or a "word.json" filename.
    if word.split(".")[-1] == "json":
        return word.split(".")[0].lower()
    else:
        return word.lower()

def getAndSearchWord(word):
    try:
        Word.get(word)
        with open("../data/pronunciations/" + word + ".json", mode="w") as f:
            json.dump(Word.pronunciations(), f, ensure_ascii=False)
    except Exception as e:
        # Remember words the scraper cannot resolve.
        r = redis.Redis(connection_pool=pool)
        r.sadd("notPron", word)
        print(word)

def main():
    r = redis.Redis(connection_pool=pool)
    needCrawlWordList = []
    downloadFiles = os.listdir("../data/pronunciations/")
    totalFiles = os.listdir("../data/crawl/")
    # Only words that were crawled but not yet pronounced.
    needCrawlWordListFile = list(set(totalFiles) - set(downloadFiles))
    notFoundWordList = list(r.smembers("notPron"))
    print("notFoundWordList", notFoundWordList)
    for tmp in needCrawlWordListFile:
        needCrawlWordList.append(tmp.replace(".json", ""))
    needCrawlWordList = list(filter(lambda x: x not in notFoundWordList, needCrawlWordList))
    random.shuffle(needCrawlWordList)
    wordListLower = list(map(lower, needCrawlWordList))
    threadPool = threadpool.ThreadPool(24)
    reqs = threadpool.makeRequests(getAndSearchWord, wordListLower)
    for req in reqs:
        threadPool.putRequest(req)
    threadPool.wait()

if __name__ == '__main__':
    main()