JSON File Preprocessing

join_publishers.py

# Deduplicate and sort the full publisher list
# 4_publishers folder

import os
import time

path = "D:/Open Academic Graph v2/mag_papers/papers/4_publishers/"
files = os.listdir(path)  # run in a clean folder: outputs from earlier runs would be read back in

# Concatenate every per-file publisher list into one file
with open(path + "publishers.txt", 'a', encoding='UTF-8') as f1:
    for f in files:
        start = time.perf_counter()

        with open(path + f, 'r', encoding='UTF-8') as file_object:
            lines = file_object.readlines()
            for line in lines:
                f1.write(line)

        end = time.perf_counter()
        print(f, end - start)

start = time.perf_counter()

# Read the merged file back, deduplicate, sort, and write the result
with open(path + "publishers.txt", 'r', encoding='UTF-8') as f1:
    publishers = f1.readlines()
    publishers = list(set(publishers))  # deduplicate
    publishers.sort()  # sort
    with open(path + "1_publishers.txt", 'a', encoding='UTF-8') as f2:
        for v in publishers:
            f2.write(v)

end = time.perf_counter()
print("final", end - start)

join_venues.py

# Deduplicate and sort the full venue list
# 4_venues folder

import os
import time

path = "D:/Open Academic Graph v2/mag_papers/papers/4_venues/"
files = os.listdir(path)

# Concatenate every per-file venue list into one file
with open(path + "venues.txt", 'a', encoding='UTF-8') as f1:
    for f in files:
        start = time.perf_counter()

        with open(path + f, 'r', encoding='UTF-8') as file_object:
            lines = file_object.readlines()
            for line in lines:
                f1.write(line)

        end = time.perf_counter()
        print(f, end - start)

start = time.perf_counter()

# Read the merged file back, deduplicate, sort, and write the result
with open(path + "venues.txt", 'r', encoding='UTF-8') as f1:
    venues = f1.readlines()
    venues = list(set(venues))  # deduplicate
    venues.sort()  # sort
    with open(path + "1_venues.txt", 'a', encoding='UTF-8') as f2:
        for v in venues:
            f2.write(v)

end = time.perf_counter()
print("final", end - start)

lines.py

# Count the number of lines in each file

import os
import time

path = "D:/Open Academic Graph v2/mag_papers/papers/3_publish/"
files = os.listdir(path)

for f in files:
    start = time.perf_counter()

    count = 0
    with open(path + f, 'r', encoding='UTF-8') as file:
        for _ in file:
            count += 1

    end = time.perf_counter()
    print(f, count, end - start, "s")
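
The counting loop collapses into a single generator expression with the same behavior:

import os

path = "D:/Open Academic Graph v2/mag_papers/papers/3_publish/"
for f in os.listdir(path):
    with open(path + f, 'r', encoding='UTF-8') as file:
        print(f, sum(1 for _ in file))  # count lines without an explicit counter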

publish.py

# Drop records that have neither a venue nor a publisher
# 3_publish folder

import json
import os
import time

path = "D:/Open Academic Graph v2/mag_papers/papers/2_year/"
files = os.listdir(path)

for f in files:
    start = time.perf_counter()

    with open(path + f, 'r', encoding='UTF-8') as file_object:
        lines = file_object.readlines()
    with open(path + "publish_" + f, 'a', encoding='UTF-8') as f1, \
            open(path + "no_" + f, 'a', encoding='UTF-8') as f2:
        for line in lines:
            record = json.loads(line)  # parse the JSON line (safer than eval)
            venue = record.get("venue")
            publisher = record.get("publisher", "")
            # Keep a record if it has a venue or a non-empty publisher;
            # otherwise route it to the "no_" file
            if venue or publisher != "":
                f1.write(json.dumps(record))  # dict back to a JSON string
                f1.write('\n')
            else:
                f2.write(json.dumps(record))
                f2.write('\n')

    end = time.perf_counter()
    print(f, end - start)
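
To make the routing rule concrete, here is how three made-up records would be classified (the records are illustrative, not from the dataset):

import json

samples = [
    '{"title": "A", "venue": {"raw": "Nature"}}',   # has a venue -> kept
    '{"title": "B", "publisher": "Springer"}',      # no venue, non-empty publisher -> kept
    '{"title": "C", "publisher": ""}',              # neither -> dropped
]

for s in samples:
    record = json.loads(s)
    kept = bool(record.get("venue")) or record.get("publisher", "") != ""
    print(record["title"], "kept" if kept else "dropped")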

txt2json.py

# Convert a JSON-lines txt file into a JSON array
# for import via Navicat for MongoDB

import time


def txt2json(s, name):
    with open(s + name + ".txt", 'r', encoding='UTF-8') as file_object:
        lines = file_object.readlines()
    with open(s + name + "_json.txt", 'w', encoding='UTF-8') as file:
        file.write('[\n')
        for line in lines:
            file.write(line.rstrip() + ',\n')

    # Remove the trailing comma and line break:
    # 3 bytes, assuming Windows CRLF line endings (",\r\n")
    with open(s + name + "_json.txt", 'rb+') as file:
        file.seek(-3, 2)
        file.truncate()

    with open(s + name + "_json.txt", 'a', encoding='UTF-8') as file:
        file.write('\n]')


path = "C:/Users/Lin/Desktop/"
lists = ["publish_1950_mag_papers_10"]
for i in lists:
    start = time.perf_counter()
    txt2json(path, i)
    end = time.perf_counter()
    print(i, end - start)
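
The seek(-3, 2) trick assumes Windows CRLF line endings, where the trailing ",\r\n" is exactly three bytes; on Unix line endings it would eat one character too many. Writing the comma before each record instead of after it avoids the truncation entirely; a sketch (the txt2json_v2 name is mine):

def txt2json_v2(s, name):
    """Same conversion, without byte-level truncation (works with any line endings)."""
    with open(s + name + ".txt", 'r', encoding='UTF-8') as src, \
            open(s + name + "_json.txt", 'w', encoding='UTF-8') as dst:
        dst.write('[')
        for i, line in enumerate(src):
            # Emit the separator before every record except the first
            dst.write(',\n' if i else '\n')
            dst.write(line.rstrip())
        dst.write('\n]')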

venues_publishers.py

# Extract every venue and publisher value (deduplicated and sorted later)
# 3_publish folder
# 4_publishers folder

import json
import os
import time

path = "D:/Open Academic Graph v2/mag_papers/papers/3_publish/"
files = os.listdir(path)

for f in files:
    start = time.perf_counter()

    with open(path + f, 'r', encoding='UTF-8') as file_object:
        lines = file_object.readlines()
    with open(path + "venues_" + f, 'a', encoding='UTF-8') as f1, \
            open(path + "publishers_" + f, 'a', encoding='UTF-8') as f2:
        for line in lines:
            record = json.loads(line)  # parse the JSON line (safer than eval)
            try:
                if record["venue"] != "":
                    # json.dumps escapes embedded newlines, so each name
                    # stays on one line for the later line-based dedupe
                    f1.write(json.dumps(record["venue"]["raw"]))
                    f1.write('\n')
            except KeyError:
                pass

            try:
                if record["publisher"] != "":
                    f2.write(json.dumps(record["publisher"]))
                    f2.write('\n')
            except KeyError:
                pass

    end = time.perf_counter()
    print(f, end - start)

year.py

# Split records into: 1950 and later, before 1950, and no year
# 2_year folder

import json
import time

path = "D:/Open Academic Graph v2/papers/"

for i in range(11):
    start = time.perf_counter()

    with open(path + "mag_papers_" + str(i) + ".txt", 'r', encoding='UTF-8') as file_object:
        lines = file_object.readlines()
    with open(path + "year/1950_mag_papers_" + str(i) + ".txt", 'a', encoding='UTF-8') as f1, \
            open(path + "year/other_mag_papers_" + str(i) + ".txt", 'a', encoding='UTF-8') as f2, \
            open(path + "year/no_mag_papers_" + str(i) + ".txt", 'a', encoding='UTF-8') as f3:
        for line in lines:
            record = json.loads(line)  # parse the JSON line (safer than eval)
            try:
                if record["year"] >= 1950:
                    f1.write(json.dumps(record))  # dict back to a JSON string
                    f1.write('\n')
                else:
                    f2.write(json.dumps(record))
                    f2.write('\n')
            except KeyError:  # no "year" field at all
                f3.write(json.dumps(record))
                f3.write('\n')

    end = time.perf_counter()
    print(i, end - start)
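
Each mag_papers_*.txt file is large, and readlines() pulls the whole file into memory before any record is processed. Iterating the open file handle streams one line at a time instead, and the change applies to every script above; a sketch of the >= 1950 bucket only (the other two buckets follow the same pattern):

import json

path = "D:/Open Academic Graph v2/papers/"
i = 0  # one shard, for illustration

with open(path + "mag_papers_" + str(i) + ".txt", 'r', encoding='UTF-8') as file_object, \
        open(path + "year/1950_mag_papers_" + str(i) + ".txt", 'a', encoding='UTF-8') as f1:
    for line in file_object:  # streams line by line, no readlines()
        record = json.loads(line)
        if record.get("year", 0) >= 1950:
            f1.write(json.dumps(record) + '\n')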

Source: https://roachlin.github.io/2021-12-02-json-deal/
Author: RoachLin · Published 2021-12-02