搜狗输入法词库下载代码

all.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import os
from urllib.request import urlopen
from bs4 import BeautifulSoup
import urllib.request

# Format: [category name, category id used in the URL, number of pages]
cates = [
    ["城市信息", 167, 8],
    ["自然科学", 1, 28],
    ["社会科学", 76, 34],
    ["工程应用", 96, 75],
    ["农林渔畜", 127, 9],
    ["医学医药", 132, 32],
    ["电子游戏", 436, 100],
    ["艺术设计", 154, 17],
    ["生活百科", 389, 77],
    ["运动休闲", 367, 16],
    ["人文科学", 31, 81],
    ["娱乐休闲", 403, 101],
]

# Characters that are not allowed in Windows file names; str.translate
# deletes them in a single C-level pass instead of one replace() per char.
ILLEGAL_CHARS = str.maketrans("", "", '/\\:*?"<>|')

for cate in cates:
    count = 0
    # makedirs(..., exist_ok=True) instead of mkdir: re-running the
    # script no longer crashes with FileExistsError.
    os.makedirs("./" + cate[0], exist_ok=True)
    for page in range(1, cate[2] + 1):
        html = urlopen("https://pinyin.sogou.com/dict/cate/index/"
                       + str(cate[1]) + "/default/" + str(page))
        bsObj = BeautifulSoup(html.read(), "html.parser")
        nameList = bsObj.findAll("div", {"class": "detail_title"})
        urlList = bsObj.findAll("div", {"class": "dict_dl_btn"})
        for name, url in zip(nameList, urlList):
            count += 1
            # Strip characters that are illegal in Windows file names.
            name = name.a.get_text().translate(ILLEGAL_CHARS)
            # count is prefixed because dictionary names may repeat.
            urllib.request.urlretrieve(
                url.a.attrs['href'],
                "./" + cate[0] + "/" + str(count) + name + ".scel")
            print(cate[0], count, name)

recommend.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import os
from urllib.request import urlopen
from bs4 import BeautifulSoup
import urllib.request

# Format: [category name, category id used in the URL, number of pages]
cates = [
    ["城市信息", 167, 8],
    ["自然科学", 1, 28],
    ["社会科学", 76, 34],
    ["工程应用", 96, 75],
    ["农林渔畜", 127, 9],
    ["医学医药", 132, 32],
    ["电子游戏", 436, 100],
    ["艺术设计", 154, 17],
    ["生活百科", 389, 77],
    ["运动休闲", 367, 16],
    ["人文科学", 31, 81],
    ["娱乐休闲", 403, 101],
]

# Characters that are not allowed in Windows file names; str.translate
# deletes them in a single C-level pass instead of one replace() per char.
ILLEGAL_CHARS = str.maketrans("", "", '/\\:*?"<>|')

for cate in cates:
    count = 0
    # makedirs(..., exist_ok=True) instead of mkdir: re-running the
    # script no longer crashes with FileExistsError.
    os.makedirs("./" + cate[0] + " 官方推荐", exist_ok=True)
    for page in range(1, cate[2] + 1):
        html = urlopen("https://pinyin.sogou.com/dict/cate/index/"
                       + str(cate[1]) + "/default/" + str(page))
        bsObj = BeautifulSoup(html.read(), "html.parser")
        nameList = bsObj.findAll("div", {"class": "detail_title"})
        urlList = bsObj.findAll("div", {"class": "dict_dl_btn"})
        for name, url in zip(nameList, urlList):
            count += 1
            name = name.a.get_text()
            # Keep only officially recommended ("官方推荐") dictionaries.
            if "官方推荐" not in name:
                continue
            # Strip characters that are illegal in Windows file names.
            name = name.translate(ILLEGAL_CHARS)
            # count is prefixed because dictionary names may repeat.
            urllib.request.urlretrieve(
                url.a.attrs['href'],
                "./" + cate[0] + " 官方推荐" + "/" + str(count) + name + ".scel")
            print(cate[0], count, name)

remove.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# Merge 1.txt .. 12.txt into all.txt, then write a deduplicated, sorted
# copy to all_remove.txt.

# Report the total number of lines across all input files.
total_lines = 0
for i in range(1, 13):
    with open("./" + str(i) + ".txt", 'r', encoding='UTF-8') as file:
        total_lines += sum(1 for _ in file)
print(total_lines)

# Concatenate every input file into all.txt. Mode 'w' (not 'a'): with
# append mode a re-run would duplicate the entire contents.
with open("./all.txt", 'w', encoding='UTF-8') as f1:
    for i in range(1, 13):
        with open("./" + str(i) + ".txt", 'r', encoding='UTF-8') as file:
            f1.writelines(file)

# Deduplicate and sort; again 'w' so re-runs are idempotent.
with open("./all.txt", 'r', encoding='UTF-8') as f1:
    unique_lines = sorted(set(f1))
with open("./all_remove.txt", 'w', encoding='UTF-8') as f2:
    f2.writelines(unique_lines)

成品软件下载:
all.exe
recommend.exe


搜狗输入法词库下载代码
https://roachlin.github.io/2022-09-02-sogou-dict/
作者
RoachLin
发布于
2022年9月2日
许可协议