JSON File Preprocessing

join_publishers.py

# Deduplicate and sort the full publisher list
# 4_publishers folder

import os
import time

path = "D:/Open Academic Graph v2/mag_papers/papers/4_publishers/"
files = os.listdir(path)  # run in a clean folder: outputs from earlier runs would be read back in

# Concatenate every per-file publisher list into one file
with open(path + "publishers.txt", 'a', encoding='UTF-8') as f1:
    for f in files:
        start = time.perf_counter()

        with open(path + f, 'r', encoding='UTF-8') as file_object:
            lines = file_object.readlines()
            for line in lines:
                f1.write(line)

        end = time.perf_counter()
        print(f, end - start)

start = time.perf_counter()

# Read the merged file back, deduplicate, sort, and write the result
with open(path + "publishers.txt", 'r', encoding='UTF-8') as f1:
    publishers = f1.readlines()
    publishers = list(set(publishers))  # deduplicate
    publishers.sort()  # sort
    with open(path + "1_publishers.txt", 'a', encoding='UTF-8') as f2:
        for v in publishers:
            f2.write(v)

end = time.perf_counter()
print("final", end - start)

join_venues.py

# Deduplicate and sort the full venue list
# 4_venues folder

import os
import time

path = "D:/Open Academic Graph v2/mag_papers/papers/4_venues/"
files = os.listdir(path)

# Concatenate every per-file venue list into one file
with open(path + "venues.txt", 'a', encoding='UTF-8') as f1:
    for f in files:
        start = time.perf_counter()

        with open(path + f, 'r', encoding='UTF-8') as file_object:
            lines = file_object.readlines()
            for line in lines:
                f1.write(line)

        end = time.perf_counter()
        print(f, end - start)

start = time.perf_counter()

# Read the merged file back, deduplicate, sort, and write the result
with open(path + "venues.txt", 'r', encoding='UTF-8') as f1:
    venues = f1.readlines()
    venues = list(set(venues))  # deduplicate
    venues.sort()  # sort
    with open(path + "1_venues.txt", 'a', encoding='UTF-8') as f2:
        for v in venues:
            f2.write(v)

end = time.perf_counter()
print("final", end - start)

lines.py

# Count the number of lines in each file

import os
import time

path = "D:/Open Academic Graph v2/mag_papers/papers/3_publish/"
files = os.listdir(path)

for f in files:
    start = time.perf_counter()

    count = 0
    with open(path + f, 'r', encoding='UTF-8') as file:
        for _ in file:
            count += 1

    end = time.perf_counter()
    print(f, count, end - start, "s")
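
The counting loop collapses into a single generator expression with the same behavior:

import os

path = "D:/Open Academic Graph v2/mag_papers/papers/3_publish/"
for f in os.listdir(path):
    with open(path + f, 'r', encoding='UTF-8') as file:
        print(f, sum(1 for _ in file))  # count lines without an explicit counter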

publish.py

# Drop records that have neither a venue nor a publisher
# 3_publish folder

import json
import os
import time

path = "D:/Open Academic Graph v2/mag_papers/papers/2_year/"
files = os.listdir(path)

for f in files:
    start = time.perf_counter()

    with open(path + f, 'r', encoding='UTF-8') as file_object:
        lines = file_object.readlines()
    with open(path + "publish_" + f, 'a', encoding='UTF-8') as f1, \
            open(path + "no_" + f, 'a', encoding='UTF-8') as f2:
        for line in lines:
            record = json.loads(line)  # parse the JSON line (safer than eval)
            venue = record.get("venue")
            publisher = record.get("publisher", "")
            # Keep a record if it has a venue or a non-empty publisher;
            # otherwise route it to the "no_" file
            if venue or publisher != "":
                f1.write(json.dumps(record))  # dict back to a JSON string
                f1.write('\n')
            else:
                f2.write(json.dumps(record))
                f2.write('\n')

    end = time.perf_counter()
    print(f, end - start)
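
To make the routing rule concrete, here is how three made-up records would be classified (the records are illustrative, not from the dataset):

import json

samples = [
    '{"title": "A", "venue": {"raw": "Nature"}}',   # has a venue -> kept
    '{"title": "B", "publisher": "Springer"}',      # no venue, non-empty publisher -> kept
    '{"title": "C", "publisher": ""}',              # neither -> dropped
]

for s in samples:
    record = json.loads(s)
    kept = bool(record.get("venue")) or record.get("publisher", "") != ""
    print(record["title"], "kept" if kept else "dropped")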

txt2json.py

# Convert a JSON-lines txt file into a JSON array
# for import via Navicat for MongoDB

import time


def txt2json(s, name):
    with open(s + name + ".txt", 'r', encoding='UTF-8') as file_object:
        lines = file_object.readlines()
    with open(s + name + "_json.txt", 'w', encoding='UTF-8') as file:
        file.write('[\n')
        for line in lines:
            file.write(line.rstrip() + ',\n')

    # Remove the trailing comma and line break:
    # 3 bytes, assuming Windows CRLF line endings (",\r\n")
    with open(s + name + "_json.txt", 'rb+') as file:
        file.seek(-3, 2)
        file.truncate()

    with open(s + name + "_json.txt", 'a', encoding='UTF-8') as file:
        file.write('\n]')


path = "C:/Users/Lin/Desktop/"
lists = ["publish_1950_mag_papers_10"]
for i in lists:
    start = time.perf_counter()
    txt2json(path, i)
    end = time.perf_counter()
    print(i, end - start)
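
The seek(-3, 2) trick assumes Windows CRLF line endings, where the trailing ",\r\n" is exactly three bytes; on Unix line endings it would eat one character too many. Writing the comma before each record instead of after it avoids the truncation entirely; a sketch (the txt2json_v2 name is mine):

def txt2json_v2(s, name):
    """Same conversion, without byte-level truncation (works with any line endings)."""
    with open(s + name + ".txt", 'r', encoding='UTF-8') as src, \
            open(s + name + "_json.txt", 'w', encoding='UTF-8') as dst:
        dst.write('[')
        for i, line in enumerate(src):
            # Emit the separator before every record except the first
            dst.write(',\n' if i else '\n')
            dst.write(line.rstrip())
        dst.write('\n]')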

venues_publishers.py

# Extract every venue and publisher value (deduplicated and sorted later)
# 3_publish folder
# 4_publishers folder

import json
import os
import time

path = "D:/Open Academic Graph v2/mag_papers/papers/3_publish/"
files = os.listdir(path)

for f in files:
    start = time.perf_counter()

    with open(path + f, 'r', encoding='UTF-8') as file_object:
        lines = file_object.readlines()
    with open(path + "venues_" + f, 'a', encoding='UTF-8') as f1, \
            open(path + "publishers_" + f, 'a', encoding='UTF-8') as f2:
        for line in lines:
            record = json.loads(line)  # parse the JSON line (safer than eval)
            try:
                if record["venue"] != "":
                    # json.dumps escapes embedded newlines, so each name
                    # stays on one line for the later line-based dedupe
                    f1.write(json.dumps(record["venue"]["raw"]))
                    f1.write('\n')
            except KeyError:
                pass

            try:
                if record["publisher"] != "":
                    f2.write(json.dumps(record["publisher"]))
                    f2.write('\n')
            except KeyError:
                pass

    end = time.perf_counter()
    print(f, end - start)

year.py

# Split records into: 1950 and later, before 1950, and no year
# 2_year folder

import json
import time

path = "D:/Open Academic Graph v2/papers/"

for i in range(11):
    start = time.perf_counter()

    with open(path + "mag_papers_" + str(i) + ".txt", 'r', encoding='UTF-8') as file_object:
        lines = file_object.readlines()
    with open(path + "year/1950_mag_papers_" + str(i) + ".txt", 'a', encoding='UTF-8') as f1, \
            open(path + "year/other_mag_papers_" + str(i) + ".txt", 'a', encoding='UTF-8') as f2, \
            open(path + "year/no_mag_papers_" + str(i) + ".txt", 'a', encoding='UTF-8') as f3:
        for line in lines:
            record = json.loads(line)  # parse the JSON line (safer than eval)
            try:
                if record["year"] >= 1950:
                    f1.write(json.dumps(record))  # dict back to a JSON string
                    f1.write('\n')
                else:
                    f2.write(json.dumps(record))
                    f2.write('\n')
            except KeyError:  # no "year" field at all
                f3.write(json.dumps(record))
                f3.write('\n')

    end = time.perf_counter()
    print(i, end - start)
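
Each mag_papers_*.txt file is large, and readlines() pulls the whole file into memory before any record is processed. Iterating the open file handle streams one line at a time instead, and the change applies to every script above; a sketch of the >= 1950 bucket only (the other two buckets follow the same pattern):

import json

path = "D:/Open Academic Graph v2/papers/"
i = 0  # one shard, for illustration

with open(path + "mag_papers_" + str(i) + ".txt", 'r', encoding='UTF-8') as file_object, \
        open(path + "year/1950_mag_papers_" + str(i) + ".txt", 'a', encoding='UTF-8') as f1:
    for line in file_object:  # streams line by line, no readlines()
        record = json.loads(line)
        if record.get("year", 0) >= 1950:
            f1.write(json.dumps(record) + '\n')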

Source: https://roachlin.github.io/2021-12-02-json-deal/
Author: RoachLin · Published 2021-12-02