daman043 committed
Commit 99ef2b1 · 1 Parent(s): c4ef4df

Upload 9 files

.gitattributes CHANGED
@@ -1,34 +1,17 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
  *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
  *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
  *.model filter=lfs diff=lfs merge=lfs -text
  *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
  *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
 
README.md ADDED
@@ -0,0 +1,90 @@
+ ---
+ language: zh
+ widget:
+ - text: "江苏警方通报特斯拉冲进店铺"
+
+ ---
+
+ # Chinese RoBERTa-Base Model for NER
+
+ ## Model description
+
+ The model is used for named entity recognition. You can download the model either from the [UER-py Modelzoo page](https://github.com/dbiir/UER-py/wiki/Modelzoo) (in UER-py format) or via HuggingFace from the link [roberta-base-finetuned-cluener2020-chinese](https://huggingface.co/uer/roberta-base-finetuned-cluener2020-chinese).
+
+ ## How to use
+
+ You can use this model directly with a pipeline for token classification:
+
+ ```python
+ >>> from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
+ >>> model = AutoModelForTokenClassification.from_pretrained('uer/roberta-base-finetuned-cluener2020-chinese')
+ >>> tokenizer = AutoTokenizer.from_pretrained('uer/roberta-base-finetuned-cluener2020-chinese')
+ >>> ner = pipeline('ner', model=model, tokenizer=tokenizer)
+ >>> ner("江苏警方通报特斯拉冲进店铺")
+ [
+     {'word': '江', 'score': 0.49153077602386475, 'entity': 'B-address', 'index': 1, 'start': 0, 'end': 1},
+     {'word': '苏', 'score': 0.6319217681884766, 'entity': 'I-address', 'index': 2, 'start': 1, 'end': 2},
+     {'word': '特', 'score': 0.5912262797355652, 'entity': 'B-company', 'index': 7, 'start': 6, 'end': 7},
+     {'word': '斯', 'score': 0.69145667552948, 'entity': 'I-company', 'index': 8, 'start': 7, 'end': 8},
+     {'word': '拉', 'score': 0.7054660320281982, 'entity': 'I-company', 'index': 9, 'start': 8, 'end': 9}
+ ]
+ ```
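+
+ The output above is one tag per Chinese character. If you prefer merged entity spans, a minimal sketch (added here for illustration, not part of the original card) is to let the pipeline aggregate consecutive B-/I- tags; this assumes a transformers version that supports `aggregation_strategy` (older releases use `grouped_entities=True` instead):
+
+ ```python
+ >>> # Sketch: group the character-level predictions into whole entities.
+ >>> ner_grouped = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy='simple')
+ >>> ner_grouped("江苏警方通报特斯拉冲进店铺")
+ # Returns a list of dicts with keys such as 'entity_group', 'word', 'score',
+ # 'start', 'end' -- e.g. one address span for 江苏 and one company span for 特斯拉.
+ ```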
+
+ ## Training data
+
+ [CLUENER2020](https://github.com/CLUEbenchmark/CLUENER2020) is used as training data. We only use the training set of the dataset.
+
+ ## Training procedure
+
+ The model is fine-tuned by [UER-py](https://github.com/dbiir/UER-py/) on [Tencent Cloud](https://cloud.tencent.com/). We fine-tune for five epochs with a sequence length of 512 on the basis of the pre-trained model [chinese_roberta_L-12_H-768](https://huggingface.co/uer/chinese_roberta_L-12_H-768). At the end of each epoch, the model is saved when the best performance on the development set is achieved.
+
+ ```
+ python3 run_ner.py --pretrained_model_path models/cluecorpussmall_roberta_base_seq512_model.bin-250000 \
+                    --vocab_path models/google_zh_vocab.txt \
+                    --train_path datasets/cluener2020/train.tsv \
+                    --dev_path datasets/cluener2020/dev.tsv \
+                    --label2id_path datasets/cluener2020/label2id.json \
+                    --output_model_path models/cluener2020_ner_model.bin \
+                    --learning_rate 3e-5 --epochs_num 5 --batch_size 32 --seq_length 512
+ ```
+
+ Finally, we convert the fine-tuned model into Huggingface's format:
+
+ ```
+ python3 scripts/convert_bert_token_classification_from_uer_to_huggingface.py --input_model_path models/cluener2020_ner_model.bin \
+                                                                              --output_model_path pytorch_model.bin \
+                                                                              --layers_num 12
+ ```
+
+ ### BibTeX entry and citation info
+
+ ```
+ @article{devlin2018bert,
+   title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
+   author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
+   journal={arXiv preprint arXiv:1810.04805},
+   year={2018}
+ }
+
+ @article{liu2019roberta,
+   title={RoBERTa: A Robustly Optimized BERT Pretraining Approach},
+   author={Liu, Yinhan and Ott, Myle and Goyal, Naman and Du, Jingfei and Joshi, Mandar and Chen, Danqi and Levy, Omer and Lewis, Mike and Zettlemoyer, Luke and Stoyanov, Veselin},
+   journal={arXiv preprint arXiv:1907.11692},
+   year={2019}
+ }
+
+ @article{xu2020cluener2020,
+   title={CLUENER2020: Fine-grained Named Entity Recognition for Chinese},
+   author={Xu, Liang and Dong, Qianqian and Yu, Cong and Tian, Yin and Liu, Weitang and Li, Lu and Zhang, Xuanwei},
+   journal={arXiv preprint arXiv:2001.04351},
+   year={2020}
+ }
+
+ @article{zhao2019uer,
+   title={UER: An Open-Source Toolkit for Pre-training Models},
+   author={Zhao, Zhe and Chen, Hui and Zhang, Jinbin and Zhao, Xin and Liu, Tao and Lu, Wei and Chen, Xi and Deng, Haotang and Ju, Qi and Du, Xiaoyong},
+   journal={EMNLP-IJCNLP 2019},
+   pages={241},
+   year={2019}
+ }
+ ```
config.json ADDED
@@ -0,0 +1,86 @@
+ {
+   "architectures": [
+     "BertForTokenClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "O",
+     "1": "B-address",
+     "2": "I-address",
+     "3": "B-book",
+     "4": "I-book",
+     "5": "B-company",
+     "6": "I-company",
+     "7": "B-game",
+     "8": "I-game",
+     "9": "B-government",
+     "10": "I-government",
+     "11": "B-movie",
+     "12": "I-movie",
+     "13": "B-name",
+     "14": "I-name",
+     "15": "B-organization",
+     "16": "I-organization",
+     "17": "B-position",
+     "18": "I-position",
+     "19": "B-scene",
+     "20": "I-scene",
+     "21": "S-address",
+     "22": "S-book",
+     "23": "S-company",
+     "24": "S-game",
+     "25": "S-government",
+     "26": "S-movie",
+     "27": "S-name",
+     "28": "S-organization",
+     "29": "S-position",
+     "30": "S-scene",
+     "31": "[PAD]"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "B-address": 1,
+     "B-book": 3,
+     "B-company": 5,
+     "B-game": 7,
+     "B-government": 9,
+     "B-movie": 11,
+     "B-name": 13,
+     "B-organization": 15,
+     "B-position": 17,
+     "B-scene": 19,
+     "I-address": 2,
+     "I-book": 4,
+     "I-company": 6,
+     "I-game": 8,
+     "I-government": 10,
+     "I-movie": 12,
+     "I-name": 14,
+     "I-organization": 16,
+     "I-position": 18,
+     "I-scene": 20,
+     "O": 0,
+     "S-address": 21,
+     "S-book": 22,
+     "S-company": 23,
+     "S-game": 24,
+     "S-government": 25,
+     "S-movie": 26,
+     "S-name": 27,
+     "S-organization": 28,
+     "S-position": 29,
+     "S-scene": 30,
+     "[PAD]": 31
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "vocab_size": 21128
+ }
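The `id2label` / `label2id` maps above cover the CLUENER2020 tag set (BIO tags plus single-token `S-` tags and a `[PAD]` slot). As a minimal sketch of how these maps are used, assuming the model is loaded through transformers (this snippet is illustrative and not one of the uploaded files):

```python
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

name = "uer/roberta-base-finetuned-cluener2020-chinese"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForTokenClassification.from_pretrained(name)

inputs = tokenizer("江苏警方通报特斯拉冲进店铺", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits               # shape (1, seq_len, 32): one score per label

pred_ids = logits.argmax(dim=-1)[0].tolist()      # best label id for each token
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
for token, label_id in zip(tokens, pred_ids):
    # config.id2label maps the integer id back to a tag string, e.g. 江 -> B-address
    print(token, model.config.id2label[label_id])
```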
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3fcc5a4fdb2a83463bf4f65ea0d60e3ecb31963cf85f4f513f3b48150b522b57
+ size 406813802
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b865252516115c46bc508167fa2258f198bcce520eb7a1ac4fbf8d50dc361368
+ size 406892015
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b129d0ae906c9579f8d118d9a84df27304676d6e8822389c776e8ea33423feb5
+ size 407074912
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": true, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "special_tokens_map_file": null, "tokenizer_file": null}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff