|
import json |
|
|
|
def process_conversations(input_file, invalid_output_file, valid_output_file):
    """Split a JSONL file into valid and invalid records.

    A record is valid when its line parses as a JSON object whose
    ``conversations`` value is a list containing at least two elements.
    Every other line (unparseable JSON, a non-object JSON value, a missing
    or non-list ``conversations`` field, or a list shorter than two) is
    reported as invalid.

    Args:
        input_file: Path of the UTF-8 JSONL file to read, one JSON document
            per line.
        invalid_output_file: Path that receives a single JSON array
            (indented by 4) describing the rejected lines. Each entry has
            ``line_number`` and ``data``; lines that failed to parse also
            carry an ``error`` message and hold the stripped raw text in
            ``data``.
        valid_output_file: Path that receives the accepted records, one
            JSON document per line.

    A short summary (total, valid and invalid counts plus the output paths)
    is printed to stdout once both files have been written.
    """
    invalid_records = []
    valid_records = []
    # Initialised up front so the summary still works for an empty input
    # file, where the loop below never binds the counter.
    total_lines = 0

    with open(input_file, 'r', encoding='utf-8') as infile:
        for total_lines, line in enumerate(infile, start=1):
            # Keep the try body minimal: only the parse itself can raise
            # JSONDecodeError.
            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                invalid_records.append({
                    "line_number": total_lines,
                    "error": f"JSONDecodeError: {e}",
                    "data": line.strip(),
                })
                continue

            # A line may hold valid JSON that is not an object (list, number,
            # string, null); such values have no .get(), so treat them as
            # invalid records rather than crashing.
            conversations = (
                data.get("conversations") if isinstance(data, dict) else None
            )
            if isinstance(conversations, list) and len(conversations) >= 2:
                valid_records.append(data)
            else:
                invalid_records.append({
                    "line_number": total_lines,
                    "data": data,
                })

    # Invalid records are written as one pretty-printed JSON array.
    with open(invalid_output_file, 'w', encoding='utf-8') as invalid_file:
        json.dump(invalid_records, invalid_file, ensure_ascii=False, indent=4)

    # Valid records are written back in JSONL form, one document per line.
    with open(valid_output_file, 'w', encoding='utf-8') as valid_file:
        for record in valid_records:
            valid_file.write(json.dumps(record, ensure_ascii=False) + '\n')

    print(f"总记录数: {total_lines}")
    print(f"有效记录数: {len(valid_records)}")
    print(f"无效记录数: {len(invalid_records)}")
    print(f"无效记录已保存到: {invalid_output_file}")
    print(f"有效记录已保存到: {valid_output_file}")
|
|
|
|
|
|
|
# Default paths used when this file is executed as a script.
input_file = "model5_digg1_safe.jsonl"
invalid_output_file = "invalid_records.json"
valid_output_file = "valid_records.jsonl"

# Run the processing only on direct execution, so importing this module
# does not read or write any files as a side effect.
if __name__ == "__main__":
    process_conversations(input_file, invalid_output_file, valid_output_file)