1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
| import os from urllib.request import urlopen from bs4 import BeautifulSoup import urllib.request
# Categories to crawl. Each row is [name, category_id, page_count]:
#   name        - used for the output directory name and progress output
#   category_id - number inserted into the category listing URL
#   page_count  - listing pages 1..page_count are fetched for the category
cates = [
    ["城市信息", 167, 8],
    ["自然科学", 1, 28],
    ["社会科学", 76, 34],
    ["工程应用", 96, 75],
    ["农林渔畜", 127, 9],
    ["医学医药", 132, 32],
    ["电子游戏", 436, 100],
    ["艺术设计", 154, 17],
    ["生活百科", 389, 77],
    ["运动休闲", 367, 16],
    ["人文科学", 31, 81],
    ["娱乐休闲", 403, 101],
]
# Characters that are removed from a dictionary title before it is used as
# part of a file name: / \ : * ? " < > |
sets = list('/\\:*?"<>|')
# Crawl every category in `cates` on pinyin.sogou.com and download each
# dictionary whose title contains "官方推荐" into a per-category directory
# as a .scel file.

# Translation table that deletes every character listed in `sets`, so a
# dictionary title can safely be used as part of a file name. Built once,
# outside the loops.
strip_table = str.maketrans("", "", "".join(sets))

for cate in cates:
    # cate is [name, category id used in the URL, number of listing pages].
    target_dir = "./" + cate[0] + " 官方推荐"
    # exist_ok=True lets the script be re-run without failing with
    # FileExistsError when the directory is already there.
    os.makedirs(target_dir, exist_ok=True)

    # Running index of every entry seen in this category, including entries
    # that are skipped, so the number in a file name is the entry's position
    # in the category listing.
    count = 0
    for page in range(1, cate[2] + 1):
        page_url = (
            "https://pinyin.sogou.com/dict/cate/index/"
            + str(cate[1]) + "/default/" + str(page)
        )
        # Close the HTTP response as soon as the page body has been parsed.
        with urlopen(page_url) as response:
            soup = BeautifulSoup(response.read(), "html.parser")

        titles = soup.find_all("div", {"class": "detail_title"})
        buttons = soup.find_all("div", {"class": "dict_dl_btn"})
        # Title blocks and download-button blocks are paired by position.
        for title, button in zip(titles, buttons):
            count += 1
            name = title.a.get_text()
            if "官方推荐" not in name:
                continue

            name = name.translate(strip_table)
            urllib.request.urlretrieve(
                button.a.attrs['href'],
                os.path.join(target_dir, str(count) + name + ".scel"),
            )
            print(cate[0], count, name)
|