File size: 2,626 Bytes
932845c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
import json
def process_conversations(input_file, invalid_output_file, valid_output_file):
    """Split a JSONL file into valid and invalid records.

    A record is valid when the line parses as a JSON object whose
    ``conversations`` value is a list with at least two elements. Every
    other line is invalid: lines that fail to parse, lines holding JSON
    that is not an object (list, number, string, null), and objects whose
    ``conversations`` is missing, not a list, or shorter than two.

    Args:
        input_file: Path of the UTF-8 JSONL file to read, one JSON value
            per line.
        invalid_output_file: Path that receives all invalid records as a
            single indented JSON array. Each entry carries ``line_number``
            and ``data``; unparsable lines additionally carry ``error``
            and store the stripped raw text as ``data``.
        valid_output_file: Path that receives the valid records as JSONL,
            one object per line, in input order.

    Returns:
        None. Results are written to the two output files and a summary
        is printed to stdout.
    """
    invalid_records = []  # rejected records, with their 1-based line number
    valid_records = []  # accepted records, in input order
    # Tracked separately from the loop variable so the summary below still
    # works when the input file is empty and the loop body never runs.
    total_lines = 0

    with open(input_file, 'r', encoding='utf-8') as infile:
        for line_number, line in enumerate(infile, start=1):
            total_lines = line_number
            # Keep the try body to the one call that can raise.
            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                invalid_records.append({
                    "line_number": line_number,
                    "error": f"JSONDecodeError: {str(e)}",
                    "data": line.strip(),  # raw text of the bad line
                })
                continue

            # A line may hold valid JSON that is not an object; such values
            # have no .get(), so treat them as having no conversations.
            conversations = (
                data.get("conversations") if isinstance(data, dict) else None
            )
            if isinstance(conversations, list) and len(conversations) >= 2:
                valid_records.append(data)
            else:
                invalid_records.append({
                    "line_number": line_number,
                    "data": data,
                })

    # Invalid records go out as one JSON document.
    with open(invalid_output_file, 'w', encoding='utf-8') as invalid_file:
        json.dump(invalid_records, invalid_file, ensure_ascii=False, indent=4)

    # Valid records go out as JSONL, mirroring the input format.
    with open(valid_output_file, 'w', encoding='utf-8') as valid_file:
        for record in valid_records:
            valid_file.write(json.dumps(record, ensure_ascii=False) + '\n')

    # Summary statistics.
    print(f"总记录数: {total_lines}")
    print(f"有效记录数: {len(valid_records)}")
    print(f"无效记录数: {len(invalid_records)}")
    print(f"无效记录已保存到: {invalid_output_file}")
    print(f"有效记录已保存到: {valid_output_file}")
# Example configuration: the input path and the two output paths.
input_file = "model5_digg1_safe.jsonl"  # JSONL file to read
invalid_output_file = "invalid_records.json"  # receives the invalid records
valid_output_file = "valid_records.jsonl"  # receives the valid records

# Run the function over the configured paths.
process_conversations(
    input_file=input_file,
    invalid_output_file=invalid_output_file,
    valid_output_file=valid_output_file,
)