A4 - 需求迭代 - 新增词书分类及词典优化

黄鹏宇
发布于 2022-10-18 / 515 阅读
3
0

A4 - 需求迭代 - 新增词书分类及词典优化

一、需求描述

  1. 新增词书分类(三级)
  2. 使用新接口获取单词释义
  3. 单词加上bgColor属性

二、需求拆解

2.1 词书分类

2.2 使用新接口获取单词释义

Step1. 获取所有单词列表
Step2. 清理两类单词

2.2.1 哪些需要清理

  1. 当前词典中无例句或释义的
  2. 当前词典中没有的

2.2.2 步骤

  1. 整理出新增词书的所有单词
  2. 筛选出当前词典中无例句或无释义的单词列表
  3. 用 set 对合并后的单词列表去重
  4. 爬取所有须新增的单词 https://api.dictionaryapi.dev/api/v2/entries/en/hello
  5. 爬取音标文件,及下载音频 oxfordlearnersdictionaries
  6. 调用百度翻译api,翻译例句

2.2.3 容易出错的地方

  1. 缓存不一致
  2. 词组过一遍百度api,获取简明释义
  3. 音频资源也要重新生成一遍
  4. 保留哪些部分:
    1. 只保留有例句的
    2. 如果都没有例句,则保证一个词性至少有一个定义

3. 单词加上bgColor属性

三、代码

1. 获取当前词书的wordList

输出为newBookAllWordList.json文件

[
  "hold-up",
  "overheating",
  "answerphone",
  "syncretize",
  "pedantry",
  "poohed",
  "national highway",
  "overstate",
  "scenery",
  "evenly spaced"
]



# get all csv
import os
import csv
import json
def main():
    """Collect the first column of every CSV one folder below the current directory.

    Every immediate sub-folder of ``.`` is scanned for ``*.csv`` files. The
    first row of each file is treated as a header and skipped; the first
    cell of every remaining row is taken as a word. The de-duplicated,
    sorted word list is written to ``newBookAllWordList.json`` (UTF-8).
    """
    words = set()
    path = '.'
    for entry in os.listdir(path):
        folder = os.path.join(path, entry)
        if not os.path.isdir(folder):
            continue
        for name in os.listdir(folder):
            if not name.endswith('.csv'):
                continue
            # newline='' is what the csv module expects so that quoted
            # fields containing line breaks are parsed correctly.
            # NOTE(review): the input encoding is left at the platform
            # default, as before - confirm how the CSV files were exported.
            with open(os.path.join(folder, name), mode='r', newline='') as f:
                reader = csv.reader(f)
                # Skip the header; the default keeps an empty file from
                # raising StopIteration.
                next(reader, None)
                for row in reader:
                    # Blank lines come back as [] and would fail on row[0].
                    if not row:
                        continue
                    word = row[0].strip()
                    if word:
                        words.add(word)
    # Sorted so that repeated runs produce an identical, diff-friendly file.
    allWordList = sorted(words)
    with open("newBookAllWordList.json", mode="w", encoding="utf-8") as f:
        json.dump(allWordList, f, ensure_ascii=False, indent=2)


if __name__ == '__main__':
    main()

2. 获取当前数据库中,无释义和无例句的单词List

规则:

  1. 新增词书中新增的单词
  2. base_word中缺少example的
  3. base_word中缺少explain的
  4. example为 'Currently no example.' 的
  5. 去掉词组

输出为mergeList.json

import mysql.connector
import json

# Every word currently in the dictionary (the `name` column of base_word)
wordList = []
# Every row of the `explain` table
explainList = []
# Every row of the `example` table
exampleList = []

# Every word found in the newly added word books
newBookAllWordList = []
# Words that are in the new word books but not in the database
newBookMissWordList = []

emptyExplainWordList = []
missExplainWordList = []

emptyExampleWordList = []
missExampleWordList = []

# Placeholder text that marks an example row as "no example available"
exampleEmptyStr = "Currently no example."

def _fetchAllRows(query):
	"""Run ``query`` against the local ``a4`` MySQL database and return all rows.

	The cursor and the connection are always closed, even when the query
	raises, so repeated calls do not leak connections.
	"""
	import os  # local import: this script does not import os at module level

	# NOTE(review): the literal password is kept only as a backward-compatible
	# fallback. Prefer setting A4_DB_PASSWORD, and rotate this credential
	# because it is present in source.
	conn = mysql.connector.connect(
		user='root',
		password=os.environ.get('A4_DB_PASSWORD', 'HPyuko12!!'),
		database='a4',
	)
	try:
		cursor = conn.cursor()
		try:
			cursor.execute(query)
			return cursor.fetchall()
		finally:
			cursor.close()
	finally:
		conn.close()


def saveAllWordListFromDB():
	"""Append every ``base_word.name`` to ``wordList`` and dump it to dbAllWordList.json."""
	wordList.extend(row[0] for row in _fetchAllRows('select name from base_word'))
	with open('dbAllWordList.json', 'w') as f:
		json.dump(wordList, f)


def saveAllExplainListFromDB():
	"""Append every row of ``explain`` to ``explainList`` and dump it to dbExplainList.json."""
	# `explain` is a reserved word in MySQL, hence the backticks.
	explainList.extend(_fetchAllRows('select * from `explain`'))
	with open('dbExplainList.json', 'w') as f:
		json.dump(explainList, f)


def saveAllExampleListFromDB():
	"""Append every row of ``example`` to ``exampleList`` and dump it to dbExampleList.json."""
	exampleList.extend(_fetchAllRows('select * from example'))
	with open('dbExampleList.json', 'w') as f:
		json.dump(exampleList, f)


def checkEmptyExample():
	"""Collect the words whose example text is the "no example" placeholder.

	Reads ``dbExampleList.json``, appends the matching entries to the
	module-level ``emptyExampleWordList`` and writes that list to
	``dbEmptyExampleWordList.json``.
	"""
	# dbAllWordList.json is no longer loaded here: its contents were never
	# used, so requiring the file only added a spurious failure mode.
	with open('dbExampleList.json', 'r') as f:
		exampleList = json.load(f)

	# Index 2 is compared with the placeholder and index 1 is what gets
	# recorded - presumably the word column; verify against the table schema.
	emptyExampleWordList.extend(
		example[1] for example in exampleList if example[2] == exampleEmptyStr
	)

	with open('dbEmptyExampleWordList.json', 'w') as f:
		json.dump(emptyExampleWordList, f)


# Words that are in wordList but have no row at all in exampleList
def checkMissExample():
	"""Find dictionary words that have no example row.

	Reads ``dbAllWordList.json`` and ``dbExampleList.json``, appends every
	word that never occurs at index 1 of an example row to the module-level
	``missExampleWordList``, and writes that list to
	``dbMissExampleWordList.json``.
	"""
	with open('dbAllWordList.json', 'r') as f:
		wordList = json.load(f)
	with open('dbExampleList.json', 'r') as f:
		exampleList = json.load(f)

	# A set makes each lookup O(1); the previous list lookup made the whole
	# check quadratic in the size of the dictionary.
	exampleWords = {example[1] for example in exampleList}

	missExampleWordList.extend(
		word for word in wordList if word not in exampleWords
	)
	with open('dbMissExampleWordList.json', 'w') as f:
		json.dump(missExampleWordList, f)

def checkMissExplain():
	"""Find dictionary words that have no explain row.

	Reads ``dbAllWordList.json`` and ``dbExplainList.json``, appends every
	word that never occurs at index 1 of an explain row to the module-level
	``missExplainWordList``, and writes that list to
	``dbMissExplainWordList.json``.
	"""
	with open('dbAllWordList.json', 'r') as f:
		wordList = json.load(f)
	with open('dbExplainList.json', 'r') as f:
		explainList = json.load(f)

	# Set lookup keeps the scan linear instead of quadratic.
	explainWords = {explain[1] for explain in explainList}

	missExplainWordList.extend(
		word for word in wordList if word not in explainWords
	)
	with open('dbMissExplainWordList.json', 'w') as f:
		json.dump(missExplainWordList, f)

def checkNewWord():
	"""Find words of the new word books that the database does not have yet.

	Reads ``dbAllWordList.json`` and ``newBookAllWordList.json``, appends the
	new-book words missing from the database list to the module-level
	``newBookMissWordList``, and writes that list to
	``newBookMissWordList.json``.
	"""
	with open('dbAllWordList.json', 'r') as f:
		knownWords = set(json.load(f))  # set: O(1) membership tests below

	# newBookAllWordList.json is written as UTF-8 with ensure_ascii=False, so
	# it must be read as UTF-8 rather than with the platform default encoding.
	with open('newBookAllWordList.json', 'r', encoding='utf-8') as f:
		newBookAllWordList = json.load(f)

	newBookMissWordList.extend(
		word for word in newBookAllWordList if word not in knownWords
	)
	with open('newBookMissWordList.json', 'w') as f:
		json.dump(newBookMissWordList, f)

def mergeAll():
	"""Merge the four intermediate word lists into ``mergeList.json``.

	The inputs are ``dbEmptyExampleWordList.json``,
	``dbMissExampleWordList.json``, ``dbMissExplainWordList.json`` and
	``newBookMissWordList.json``. Multi-word phrases are dropped, duplicates
	are removed, and the result is written sorted so that repeated runs
	produce the same file. The number of words kept is printed.
	"""
	sources = (
		'dbEmptyExampleWordList.json',
		'dbMissExampleWordList.json',
		'dbMissExplainWordList.json',
		'newBookMissWordList.json',
	)
	merged = set()
	for source in sources:
		with open(source, 'r') as f:
			merged.update(json.load(f))

	# Drop phrases: keep only entries that are a single whitespace-free token.
	mergeList = sorted(word for word in merged if len(word.split()) == 1)

	print(len(mergeList))
	with open('mergeList.json', 'w') as f:
		json.dump(mergeList, f)

if __name__ == '__main__':
	# Each step writes a JSON file that a later step reads, so the
	# commented-out calls must have been run at least once before mergeAll().
	# Dump the current state of the database
	# saveAllWordListFromDB()
	# saveAllExampleListFromDB()
	# saveAllExplainListFromDB()
	# Find examples that are only the "no example" placeholder
	# checkEmptyExample()
	# Find words without any example row
	# checkMissExample()
	# Find words without any explain row
	# checkMissExplain()
	# checkNewWord()
	mergeAll()

3. 从词典API中爬取mergeList的单词,并先原样存储

redis中保存404的单词 404wordList.json
本地保存各个单词的json文件 .\data\crawl\{word}.json

import requests
import json
import os
import threadpool
from concurrent.futures import ThreadPoolExecutor
import random
import time
import redis


# URL template of the dictionaryapi.dev lookup endpoint; {0} is the word
api_url = "https://api.dictionaryapi.dev/api/v2/entries/en/{0}"

# Progress counter, incremented once per crawl() call
index = 0
# Number of words queued for the current run (set by loadAllNeedCrawlWords)
total = 0

def lower(word):
    """Return ``word`` lower-cased, without a trailing ``.json`` extension.

    Only the extension itself is removed, so names that contain dots keep
    everything before it ("e.g.json" -> "e.g"); splitting on the first dot
    used to truncate such names to "e".
    """
    suffix = ".json"
    if word.endswith(suffix):
        word = word[:-len(suffix)]
    return word.lower()

# Redis connection pool shared by every redis.Redis client created in this script
pool = redis.ConnectionPool(host='localhost', port=6379, decode_responses=True)

def loadAllNeedCrawlWords():
    """Queue every word of ``../data/mergeList.json`` that still needs crawling.

    A word is skipped when ``../data/crawl`` already holds a file for it or
    when it is a member of the Redis set ``crawl`` (words the API answered
    with 404). The remaining words are shuffled and handed to ``crawl`` on a
    pool of 50 worker threads; the call returns when all of them are done.
    """
    global total

    r = redis.Redis(connection_pool=pool)

    # Names of already-downloaded files, reduced to their lower-cased word.
    alreadyCrawled = {lower(name) for name in os.listdir("../data/crawl")}
    # Words the API reported as not found on an earlier run.
    notFound = set(r.smembers("crawl"))

    # Read the list and close the file before the long-running crawl starts.
    with open("../data/mergeList.json", "r") as f:
        data = json.load(f)

    # Normalise case BEFORE filtering: files and Redis members are stored
    # lower-cased, so comparing the raw spelling re-crawled mixed-case words.
    # The set also removes duplicates that only differ in case.
    pending = {lower(word) for word in data} - alreadyCrawled - notFound
    wordListLower = list(pending)
    random.shuffle(wordListLower)
    total = len(wordListLower)

    # Fan the words out over 50 worker threads and wait for completion.
    threadPool = threadpool.ThreadPool(50)
    for req in threadpool.makeRequests(crawl, wordListLower):
        threadPool.putRequest(req)

    threadPool.wait()

def crawl(word):
    """Download the dictionary entry for ``word`` and store the raw response.

    * 200: the body is written unchanged to ``../data/crawl/<word>.json``.
    * 404: the word is added to the Redis set ``crawl`` so later runs skip it.
    * anything else: the body and status code are printed.

    Network failures are printed and the word is left for a later run. The
    function sleeps one second before returning to throttle each worker.
    """
    global index
    # NOTE: updated without a lock, so when several threads run this at once
    # the progress figure is only approximate.
    index += 1
    print("{0}/{1}".format(index, total))

    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        response = requests.get(api_url.format(word), timeout=30)
    except requests.RequestException as exc:
        print("{0}: request failed: {1}".format(word, exc))
        time.sleep(1)
        return

    if response.status_code == 200:
        # Entries contain IPA and other non-ASCII text; an explicit UTF-8
        # encoding avoids UnicodeEncodeError on non-UTF-8 default locales.
        with open("../data/crawl/{0}.json".format(word), "w", encoding="utf-8") as f:
            f.write(response.text)
    else:
        if response.status_code == 404:
            # Remember words this site has no entry for.
            r = redis.Redis(connection_pool=pool)
            r.sadd("crawl", word)
            print(word)
        print(response.text)
        print(response.status_code)
    time.sleep(1)

def main():
    """Script entry point: crawl every word that has not been fetched yet."""
    loadAllNeedCrawlWords()

if __name__ == "__main__":
    main()

4. 百度API

import requests
import re
import json
import execjs

def baiduTrans(word):
    """Translate ``word`` from English to Chinese via fanyi.baidu.com.

    The function fetches the landing page to extract the ``token`` and
    ``gtk`` values, signs the query with the ``e`` function defined in
    ``translate.js``, posts the request, and stores the JSON reply in
    ``<word>.json`` (UTF-8).

    Returns:
        The decoded JSON reply.

    Raises:
        IndexError: ``token`` or ``gtk`` could not be found in the landing page.
        requests.RequestException: a request failed or timed out.
        ValueError: the reply body is not valid JSON.
    """
    token_url = 'https://fanyi.baidu.com'
    post_url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'
    headers =  {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'cache-control': 'no-cache',
        'cookie': '[COOKIE]',
        'pragma': 'no-cache',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
    }
    index_html = requests.get(url=token_url, headers=headers, timeout=30).text
    pattern1 = r"""window\['common'\] = {.*?token:(.*?),"""
    pattern2 = r"""window.gtk = (.*?);\n"""
    # [0] raises IndexError when a pattern does not match the fetched page.
    token = re.findall(pattern1, index_html, re.S)[0]
    gtk = re.findall(pattern2, index_html, re.S)[0]
    token = token.replace("\'", "").strip()
    gtk = gtk.replace("\"", "").strip()

    with open('translate.js', 'r') as f:
        js_source = f.read()
    ex = execjs.compile(js_source)
    # call() passes the arguments as data. Building a JS expression with
    # str.format broke as soon as the query contained a quote or backslash,
    # which example sentences frequently do.
    sign = ex.call("e", word, gtk)

    data = {'from': 'en',
            'to': 'zh',
            'query': word,
            'transtype': 'translang',
            'simple_means_flag': '3',
            'domain': 'common',
            'sign': sign,
            'token': token
            }
    r_html = requests.post(url=post_url, data=data, headers=headers, timeout=30)
    # Decode first, so a non-JSON reply does not leave an empty file behind.
    transText = r_html.json()
    # The reply contains Chinese text and is dumped with ensure_ascii=False,
    # so the file encoding must not depend on the platform default.
    with open(str(word) + ".json", mode="w", encoding="utf-8") as f:
        json.dump(transText, f, ensure_ascii=False)
    return transText


if __name__ == '__main__':
    baiduTrans("world")




// Return a new plain Array holding the elements of `r`.
// Arrays are copied index by index; anything else goes through Array.from.
function a(r) {
    if (!Array.isArray(r)) {
        return Array.from(r);
    }
    var copy = Array(r.length);
    for (var idx = 0; idx < r.length; idx++) {
        copy[idx] = r[idx];
    }
    return copy;
}
// Apply the operation string `o` to the number `r` and return the result.
// `o` is read three characters at a time:
//   [2] shift amount: a letter >= "a" means charCode - 87, otherwise a digit
//   [1] "+" selects an unsigned right shift, anything else a left shift
//   [0] "+" adds the shifted value (masked with 4294967295), anything else XORs it
function n(r, o) {
    var pos = 0;
    while (pos < o.length - 2) {
        var digit = o.charAt(pos + 2);
        var amount = digit >= "a" ? digit.charCodeAt(0) - 87 : Number(digit);
        var shifted = o.charAt(pos + 1) === "+" ? r >>> amount : r << amount;
        if (o.charAt(pos) === "+") {
            r = (r + shifted) & 4294967295;
        } else {
            r = r ^ shifted;
        }
        pos += 3;
    }
    return r;
}
// Compute the signature string for query `r`, seeded by `gtk`.
// `gtk` is split on "." into two numbers (m and s below); the result has the
// form "<p>.<p ^ m>", where p is the hash reduced modulo 1e6.
function e(r,gtk) {
    // All surrogate pairs found in the input, or null when there are none.
    var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    if (null === o) {
        var t = r.length;
        // Inputs longer than 30 units are cut down to first 10 + middle 10 + last 10.
        t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10))
    } else {
        // Build f as a list of characters in which each surrogate pair is a
        // single element, so the truncation below never splits a pair.
        for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)
            "" !== e[C] && f.push.apply(f, a(e[C].split(""))),
            C !== h - 1 && f.push(o[C]);
        var g = f.length;
        g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""))
    }
    // NOTE(review): `l` evaluates to the string "gtk" but is never read below.
    var u = void 0
      , l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
    // `i` is a file-level cache: while it is null the passed gtk is stored in
    // it, and every later call in the same JS context reuses that first value.
    u = null !== i ? i : (i = gtk || "") || "";
    // m and s are the two dot-separated parts of the seed (0 when missing or
    // not numeric). The loop then encodes `r` as UTF-8 bytes into S.
    for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
        var A = r.charCodeAt(v);
        128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
        S[c++] = A >> 18 | 240,
        S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
        S[c++] = A >> 6 & 63 | 128),
        S[c++] = 63 & A | 128)
    }
    // F evaluates to "+-a^+6" and D to "+-3^+b+-f"; both are operation strings
    // for n(). Every byte is added to p and mixed with F.
    for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
        p += S[b],
        p = n(p, F);
    // Final mix with D, XOR with s, make the value non-negative, keep the last
    // six decimal digits, then append "." and p ^ m.
    return p = n(p, D),
    p ^= s,
    0 > p && (p = (2147483647 & p) + 2147483648),
    p %= 1e6,
    p.toString() + "." + (p ^ m)
}
// Cached seed used by e(); null until the first call stores its gtk argument.
var i = null;

5. 音标及发音

from oxford.oxford import Word
import json
import os
import threadpool
from concurrent.futures import ThreadPoolExecutor
import random
import redis

# Redis connection pool shared by every redis.Redis client created in this script
pool = redis.ConnectionPool(host='localhost', port=6379, decode_responses=True)

def lower(word):
    """Return ``word`` lower-cased, without a trailing ``.json`` extension.

    Only the extension itself is removed, so names that contain dots keep
    everything before it ("e.g.json" -> "e.g"); splitting on the first dot
    used to truncate such names to "e".
    """
    suffix = ".json"
    if word.endswith(suffix):
        word = word[:-len(suffix)]
    return word.lower()

def getAndSearchWord(word):
    """Look up ``word`` and save its pronunciations to ``../data/pronunciations/<word>.json``.

    When the lookup raises, the word is added to the Redis set ``notPron``
    so later runs skip it, and the word is printed. A failure while writing
    the file is only reported; the word is not marked as missing.
    """
    try:
        # NOTE(review): get() and pronunciations() are both called on the
        # Word class itself, so the current word looks like class-level
        # state - confirm this is safe when called from several threads.
        Word.get(word)
        # Fetch and serialise before opening the file: a failure used to
        # leave an empty file behind that later counted as "downloaded".
        payload = json.dumps(Word.pronunciations(), ensure_ascii=False)
    except Exception as e:
        r = redis.Redis(connection_pool=pool)
        r.sadd("notPron", word)
        print(word)
        print(repr(e))
        return

    try:
        # Phonetic transcriptions are non-ASCII, so write UTF-8 explicitly
        # instead of relying on the platform default encoding.
        with open("../data/pronunciations/" + word + ".json", mode="w", encoding="utf-8") as f:
            f.write(payload)
    except OSError as e:
        print("{0}: could not write file: {1}".format(word, e))


def main():
    """Fetch pronunciations for every crawled word that does not have them yet.

    The work list is the set of file names in ``../data/crawl/`` that are
    absent from ``../data/pronunciations/``, minus the members of the Redis
    set ``notPron``. The words are shuffled and processed by
    ``getAndSearchWord`` on a pool of 24 worker threads.
    """
    r = redis.Redis(connection_pool=pool)

    downloadFiles = set(os.listdir("../data/pronunciations/"))
    totalFiles = set(os.listdir("../data/crawl/"))

    notFoundWordList = list(r.smembers("notPron"))
    print("notFoundWordList", notFoundWordList)
    notFound = set(notFoundWordList)  # set: O(1) membership tests below

    # Lower-case before filtering: getAndSearchWord is always handed
    # lower-cased words, so that is the spelling stored in "notPron".
    pending = {
        lower(name.replace(".json", ""))
        for name in totalFiles - downloadFiles
    }
    wordListLower = [word for word in pending if word not in notFound]
    random.shuffle(wordListLower)

    # Fan the words out over 24 worker threads and wait for completion.
    threadPool = threadpool.ThreadPool(24)
    for req in threadpool.makeRequests(getAndSearchWord, wordListLower):
        threadPool.putRequest(req)

    threadPool.wait()


if __name__ == '__main__':
    main()

6. 百度API翻译例句


评论