{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from huggingface_hub import HfApi\n", "from huggingface_hub import HfApi\n", "api = HfApi()\n", "# 替换为你的用户名/仓库名\n", "repo_id = \"vincentchao/qllava-next\"\n", "# # 创建仓库(如果还不存在)\n", "api.create_repo(repo_id, repo_type=\"model\", private=False)\n", "\n", "# 上传整个目录\n", "api.upload_folder(\n", " folder_path=\"/common/home/users/w/wzhao/vqclip/qllava_next_newest\",\n", " repo_id=repo_id,\n", " repo_type=\"model\"\n", ")" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "classes2id = { 'neutral':0, 'porn':1,'gun':2,'cigarette':3,'alcohol':4, 'knife':5,'blood':6,'insulting_gesture':7}\n", "id2class = ['neutral','porn','gun','cigarette','alcohol',\"knife\",'blood','insulting_gesture']" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "56dbb817e0a24197b2749a8ff82fe593", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/7 [00:00 1\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msave_pretrained\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/common/home/users/w/wzhao/vqclip/VQLLMfinal\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 3\u001b[0m \u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.local/lib/python3.12/site-packages/transformers/modeling_utils.py:2971\u001b[0m, in \u001b[0;36mPreTrainedModel.save_pretrained\u001b[0;34m(self, save_directory, is_main_process, state_dict, save_function, push_to_hub, max_shard_size, safe_serialization, variant, token, save_peft_format, **kwargs)\u001b[0m\n\u001b[1;32m 2968\u001b[0m weights_name \u001b[38;5;241m=\u001b[39m ADAPTER_SAFE_WEIGHTS_NAME \u001b[38;5;28;01mif\u001b[39;00m safe_serialization \u001b[38;5;28;01melse\u001b[39;00m ADAPTER_WEIGHTS_NAME\n\u001b[1;32m 2970\u001b[0m filename_pattern \u001b[38;5;241m=\u001b[39m weights_name\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.bin\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{suffix}\u001b[39;00m\u001b[38;5;124m.bin\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.safetensors\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{suffix}\u001b[39;00m\u001b[38;5;124m.safetensors\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m-> 2971\u001b[0m state_dict_split \u001b[38;5;241m=\u001b[39m \u001b[43msplit_torch_state_dict_into_shards\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2972\u001b[0m \u001b[43m \u001b[49m\u001b[43mstate_dict\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilename_pattern\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilename_pattern\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_shard_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_shard_size\u001b[49m\n\u001b[1;32m 2973\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2974\u001b[0m \u001b[38;5;66;03m# Save index if sharded\u001b[39;00m\n\u001b[1;32m 2975\u001b[0m index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", "File \u001b[0;32m~/.local/lib/python3.12/site-packages/huggingface_hub/serialization/_torch.py:369\u001b[0m, in 
\u001b[0;36msplit_torch_state_dict_into_shards\u001b[0;34m(state_dict, filename_pattern, max_shard_size)\u001b[0m\n\u001b[1;32m 302\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msplit_torch_state_dict_into_shards\u001b[39m(\n\u001b[1;32m 303\u001b[0m state_dict: Dict[\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtorch.Tensor\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m 304\u001b[0m \u001b[38;5;241m*\u001b[39m,\n\u001b[1;32m 305\u001b[0m filename_pattern: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m constants\u001b[38;5;241m.\u001b[39mSAFETENSORS_WEIGHTS_FILE_PATTERN,\n\u001b[1;32m 306\u001b[0m max_shard_size: Union[\u001b[38;5;28mint\u001b[39m, \u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m MAX_SHARD_SIZE,\n\u001b[1;32m 307\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m StateDictSplit:\n\u001b[1;32m 308\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 309\u001b[0m \u001b[38;5;124;03m Split a model state dictionary in shards so that each shard is smaller than a given size.\u001b[39;00m\n\u001b[1;32m 310\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 367\u001b[0m \u001b[38;5;124;03m ```\u001b[39;00m\n\u001b[1;32m 368\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 369\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msplit_state_dict_into_shards_factory\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 370\u001b[0m \u001b[43m \u001b[49m\u001b[43mstate_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 371\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_shard_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_shard_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 372\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename_pattern\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilename_pattern\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 373\u001b[0m \u001b[43m \u001b[49m\u001b[43mget_storage_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mget_torch_storage_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 374\u001b[0m \u001b[43m \u001b[49m\u001b[43mget_storage_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mget_torch_storage_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 375\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/.local/lib/python3.12/site-packages/huggingface_hub/serialization/_base.py:108\u001b[0m, in \u001b[0;36msplit_state_dict_into_shards_factory\u001b[0;34m(state_dict, get_storage_size, filename_pattern, get_storage_id, max_shard_size)\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[1;32m 107\u001b[0m \u001b[38;5;66;03m# If a `tensor` shares the same underlying storage as another tensor, we put `tensor` in the same `block`\u001b[39;00m\n\u001b[0;32m--> 108\u001b[0m storage_id \u001b[38;5;241m=\u001b[39m \u001b[43mget_storage_id\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtensor\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 109\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m storage_id \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 110\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m storage_id \u001b[38;5;129;01min\u001b[39;00m storage_id_to_tensors:\n\u001b[1;32m 111\u001b[0m \u001b[38;5;66;03m# We skip this tensor for now and will reassign to correct shard later\u001b[39;00m\n", "File 
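, { "cell_type": "markdown", "metadata": {}, "source": [ "The `AttributeError` above suggests that `model.state_dict()` contains at least one entry that is a plain Python list rather than a tensor, which the hub's shard-splitting helper cannot handle. The next cell is a minimal, hedged sketch of one possible workaround (not part of the original run): inspect the offending keys and save only the tensor-valued entries via the `state_dict` argument of `save_pretrained`. Whether dropping the list-valued entries is acceptable depends on what the model stores in them." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import torch\n", "\n", "# Sketch of a possible workaround (assumes the error above is caused by\n", "# non-tensor entries in the state_dict).\n", "state_dict = model.state_dict()\n", "non_tensor_keys = [k for k, v in state_dict.items() if not isinstance(v, torch.Tensor)]\n", "print(\"Non-tensor entries:\", non_tensor_keys)\n", "\n", "# Save only the tensor entries; any list-valued state would have to be\n", "# stored separately (e.g. in the config or a side file).\n", "tensor_only = {k: v for k, v in state_dict.items() if isinstance(v, torch.Tensor)}\n", "model.save_pretrained(\n", "    \"/common/home/users/w/wzhao/vqclip/VQLLMfinal\",\n", "    state_dict=tensor_only,\n", ")" ] }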
, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "print(type(model))" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([56, 1])" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "np.load(\"/common/home/users/w/wzhao/vqclip/classified_results_llama2/1.npy\")" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Statistics saved to a dictionary and exported to:\n", "- JSON: /common/home/users/w/wzhao/vqclip/classification_stats.json\n", "- Pickle: /common/home/users/w/wzhao/vqclip/classification_stats.pkl\n", "\n", "Basic statistics:\n", "Found 8 distinct classes in total\n", "Processed 21213 elements in total\n", "\n", "Per-class summary:\n", "Class 0: 5413 elements, 43 distinct element IDs\n", "  Most frequent elements: ID 16: 615 times, ID 60: 609 times, ID 14: 566 times\n", "Class 1: 5554 elements, 4 distinct element IDs\n", "  Most frequent elements: ID 56: 2067 times, ID 7: 1430 times, ID 34: 1180 times\n", "Class 2: 1473 elements, 3 distinct element IDs\n", "  Most frequent elements: ID 13: 778 times, ID 24: 488 times, ID 23: 207 times\n", "Class 3: 2134 elements, 4 distinct element IDs\n", "  Most frequent elements: ID 46: 840 times, ID 33: 641 times, ID 39: 351 times\n", "Class 4: 1416 elements, 2 distinct element IDs\n", "  Most frequent elements: ID 22: 748 times, ID 42: 668 times\n", "Class 5: 2785 elements, 2 distinct element IDs\n", "  Most frequent elements: ID 52: 1654 times, ID 4: 1131 times\n", "Class 6: 1723 elements, 5 distinct element IDs\n", "  Most frequent elements: ID 20: 555 times, ID 57: 481 times, ID 17: 350 times\n", "Class 7: 715 elements, 1 distinct element IDs\n", "  Most frequent elements: ID 40: 715 times\n" ] } ], "source": [ "import os\n", "import numpy as np\n", "import json\n", "from collections import defaultdict\n", "import pickle\n", "\n", "# Directory with the per-sample classification results\n", "directory_path = '/common/home/users/w/wzhao/vqclip/classified_results_llama2'\n", "\n", "# Nested dict: for each class, how often each element ID occurs\n", "class_element_counts = defaultdict(lambda: defaultdict(int))\n", "# Total number of elements per class\n", "class_total_counts = defaultdict(int)\n", "\n", "# Iterate over all .npy files in the directory\n", "try:\n", "    for filename in os.listdir(directory_path):\n", "        if filename.endswith('.npy'):\n", "            file_path = os.path.join(directory_path, filename)\n", "\n", "            # Load the .npy file\n", "            data = np.load(file_path)\n", "\n", "            # Make sure the data has the expected format\n", "            if data.size == 2:\n", "                element_id = int(data[0])  # cast to int\n", "                class_id = int(data[1])    # cast to int\n", "\n", "                # Count this element for its class\n", "                class_element_counts[class_id][element_id] += 1\n", "                # Update the class total\n", "                class_total_counts[class_id] += 1\n", "            else:\n", "                print(f\"Warning: file {filename} has an unexpected format, skipped\")\n", "\n", "    # Convert the defaultdicts to plain dicts so they can be serialized\n", "    result_dict = {\n", "        \"class_totals\": dict(class_total_counts),\n", "        \"class_elements\": {\n", "            class_id: dict(elements)\n", "            for class_id, elements in class_element_counts.items()\n", "        }\n", "    }\n", "\n", "    # Save the results as JSON\n", "    output_json_path = os.path.join(os.path.dirname(directory_path), \"classification_stats.json\")\n", "    with open(output_json_path, 'w') as f:\n", "        json.dump(result_dict, f, indent=2)\n", "\n", "    # Also save a pickle, which is easier to reuse from Python\n", "    output_pickle_path = os.path.join(os.path.dirname(directory_path), \"classification_stats.pkl\")\n", "    with open(output_pickle_path, 'wb') as f:\n", "        pickle.dump(result_dict, f)\n", "\n", "    # Print some basic statistics\n", "    print(\"Statistics saved to a dictionary and exported to:\")\n", "    print(f\"- JSON: {output_json_path}\")\n", "    print(f\"- Pickle: {output_pickle_path}\")\n", "    print(\"\\nBasic statistics:\")\n", "    print(f\"Found {len(class_total_counts)} distinct classes in total\")\n", "    total_elements = sum(class_total_counts.values())\n", "    print(f\"Processed {total_elements} elements in total\")\n", "\n", "    # Per-class summary\n", "    print(\"\\nPer-class summary:\")\n", "    for class_id in sorted(class_total_counts.keys()):\n", "        print(f\"Class {class_id}: {class_total_counts[class_id]} elements, {len(class_element_counts[class_id])} distinct element IDs\")\n", "\n", "        # The three most frequent element IDs in this class\n", "        sorted_elements = sorted(class_element_counts[class_id].items(),\n", "                                 key=lambda x: x[1], reverse=True)[:3]\n", "\n", "        # Print them together with their counts\n", "        print(\"  Most frequent elements: \" + \", \".join([f\"ID {e_id}: {count} times\" for e_id, count in sorted_elements]))\n", "\n", "except Exception as e:\n", "    print(f\"Error: {e}\")" ] }
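, { "cell_type": "markdown", "metadata": {}, "source": [ "Hedged follow-up sketch (not part of the original analysis): the summary above shows that each class is dominated by a small set of element IDs, so the exported `classification_stats.json` can be inverted to map every element ID to the class in which it occurs most often." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "# Load the statistics written by the previous cell\n", "with open(\"/common/home/users/w/wzhao/vqclip/classification_stats.json\") as f:\n", "    stats = json.load(f)\n", "\n", "# For every element ID, keep the class in which it appears most often.\n", "# json.dump turned the integer keys into strings, so cast them back.\n", "best = {}\n", "for class_id, elements in stats[\"class_elements\"].items():\n", "    for element_id, count in elements.items():\n", "        if element_id not in best or count > best[element_id][1]:\n", "            best[element_id] = (int(class_id), count)\n", "\n", "element_to_class = {int(e): c for e, (c, _) in best.items()}\n", "print(dict(sorted(element_to_class.items())))" ] }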
defaultdict\n", "import pickle\n", "\n", "# 定义目录路径\n", "directory_path = '/common/home/users/w/wzhao/vqclip/classified_results_llama2'\n", "\n", "# 创建一个嵌套字典,用于存储每个类别中每个元素ID出现的次数\n", "class_element_counts = defaultdict(lambda: defaultdict(int))\n", "# 创建一个字典用于存储每个类别的总计数\n", "class_total_counts = defaultdict(int)\n", "\n", "# 遍历目录中的所有.npy文件\n", "try:\n", " for filename in os.listdir(directory_path):\n", " if filename.endswith('.npy'):\n", " file_path = os.path.join(directory_path, filename)\n", " \n", " # 加载.npy文件\n", " data = np.load(file_path)\n", " \n", " # 确保数据格式正确\n", " if data.size == 2:\n", " element_id = int(data[0]) # 确保是整数\n", " class_id = int(data[1]) # 确保是整数\n", " \n", " # 增加该元素在对应类别中的计数\n", " class_element_counts[class_id][element_id] += 1\n", " # 增加该类别的总计数\n", " class_total_counts[class_id] += 1\n", " else:\n", " print(f\"警告: 文件 {filename} 的数据格式不符合预期,已跳过\")\n", " \n", " # 将defaultdict转换为普通dict以便序列化\n", " result_dict = {\n", " \"class_totals\": dict(class_total_counts),\n", " \"class_elements\": {\n", " class_id: dict(elements) \n", " for class_id, elements in class_element_counts.items()\n", " }\n", " }\n", " \n", " # 保存结果到JSON文件\n", " output_json_path = os.path.join(os.path.dirname(directory_path), \"classification_stats.json\")\n", " with open(output_json_path, 'w') as f:\n", " json.dump(result_dict, f, indent=2)\n", " \n", " # 也保存为Python pickle格式,这样在后续Python处理中更方便\n", " output_pickle_path = os.path.join(os.path.dirname(directory_path), \"classification_stats.pkl\")\n", " with open(output_pickle_path, 'wb') as f:\n", " pickle.dump(result_dict, f)\n", " \n", " # 打印一些基本统计信息\n", " print(f\"统计结果已保存到字典并导出到:\")\n", " print(f\"- JSON: {output_json_path}\")\n", " print(f\"- Pickle: {output_pickle_path}\")\n", " print(\"\\n基本统计信息:\")\n", " print(f\"总共发现 {len(class_total_counts)} 个不同的类别\")\n", " total_elements = sum(class_total_counts.values())\n", " print(f\"总共处理了 {total_elements} 个元素\")\n", " \n", " # 打印每个类别的样本统计\n", " print(\"\\n各类别的统计摘要:\")\n", " for class_id in sorted(class_total_counts.keys()):\n", " print(f\"类别 {class_id}: 包含 {class_total_counts[class_id]} 个元素,有 {len(class_element_counts[class_id])} 个不同的元素ID\")\n", " \n", " # 获取该类别中出现次数最多的3个元素\n", " sorted_elements = sorted(class_element_counts[class_id].items(), \n", " key=lambda x: x[1], reverse=True)[:3]\n", " \n", " # 打印这些元素及其出现次数\n", " print(f\" 出现频率最高的元素: \" + \", \".join([f\"ID {e_id}: {count}次\" for e_id, count in sorted_elements]))\n", "\n", "except Exception as e:\n", " print(f\"发生错误: {e}\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 2 }