#!/usr/bin/env python3
import json
import os
# Directory whose regular files are each parsed into a vocabulary.
folder_path = "./vocabs"
# Collected vocabularies: file-name prefix -> {token: id}.
all_dict = {}
def parse_file(filename):
    """Build a token-to-ID mapping from a whitespace-delimited vocabulary file.

    The mapping is seeded with four fixed special tokens (``<pad>`` = 0,
    ``<s>`` = 1, ``</s>`` = 2, ``<unk>`` = 3).  Each non-blank line of the
    file then contributes its first whitespace-separated field as a token;
    IDs are handed out sequentially from 4 in file order.  Any further
    fields on a line are ignored.

    Args:
        filename: Path of the vocabulary file to read (decoded as UTF-8).

    Returns:
        A dict mapping each token string to its integer ID.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.

    Note:
        IDs are positional: the counter advances for every non-blank line.
        A token that occurs more than once (or that equals one of the
        special tokens) therefore ends up with the ID of its last
        occurrence, and the earlier ID is left unused.
    """
    dictionary = {
        "</s>": 2,
        "<pad>": 0,
        "<s>": 1,
        "<unk>": 3,
    }
    value = 4
    # Explicit UTF-8: relying on the locale default breaks on non-ASCII
    # tokens when the platform encoding is something else (e.g. cp1252).
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            # split() with no argument also discards surrounding whitespace,
            # so blank / whitespace-only lines yield an empty list.
            fields = line.split()
            if fields:
                dictionary[fields[0]] = value
                value += 1
    return dictionary
# Parse every regular file in folder_path into its own vocabulary, keyed by
# the part of the file name before the first "." (e.g. "en.txt" -> "en").
# os.listdir() returns entries in arbitrary order; sorting makes it
# deterministic which file wins when two names share the same prefix.
for filename in sorted(os.listdir(folder_path)):
    filepath = os.path.join(folder_path, filename)
    if not os.path.isfile(filepath):
        # Skip sub-directories and anything else that is not a file.
        continue
    lang = filename.split(".")[0]
    all_dict[lang] = parse_file(filepath)

# Destination of the combined JSON; change this to write somewhere else.
output_path = "vocab.json"
with open(output_path, "w", encoding="utf-8") as output_file:
    # sort_keys keeps the emitted file stable across runs.
    json.dump(all_dict, output_file, indent=4, sort_keys=True)