File size: 3,398 Bytes
7177c13 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
---
language:
- he
tags:
- language model
pipeline_tag: feature-extraction
---
## AlephBertGimmel
Modern Hebrew pretrained BERT model with a 128K token vocabulary.
This model is the [checkpoint](https://github.com/Dicta-Israel-Center-for-Text-Analysis/alephbertgimmel/tree/main/alephbertgimmel-small/ckpt_29400--Max128Seq) of alephbertgimmel-small-128 from the [alephbertgimmel](https://github.com/Dicta-Israel-Center-for-Text-Analysis/alephbertgimmel) repository.
```python
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
# Load the masked-language-model head and the matching tokenizer from the Hub.
model = AutoModelForMaskedLM.from_pretrained("imvladikon/alephbertgimmel-small-128")
tokenizer = AutoTokenizer.from_pretrained("imvladikon/alephbertgimmel-small-128")

# Template sentence; "{}" is the slot the model is asked to fill.
text = "{} היא מטרופולין המהווה את מרכז הכלכלה"

# Encode the sentence with the tokenizer's own mask token in the slot and
# locate the column of that token in the (1, seq_len) id tensor.
input_ids = tokenizer.encode(text.format(tokenizer.mask_token), return_tensors="pt")
mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]

# Inference only: no autograd graph is needed for a forward pass.
with torch.inference_mode():
    token_logits = model(input_ids).logits

# Keep the vocabulary logits at the masked position and take the 5 best ids.
mask_token_logits = token_logits[0, mask_token_index, :]
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(text.format(tokenizer.decode([token])))
# ישראל היא מטרופולין המהווה את מרכז הכלכלה
# ירושלים היא מטרופולין המהווה את מרכז הכלכלה
# חיפה היא מטרופולין המהווה את מרכז הכלכלה
# אילת היא מטרופולין המהווה את מרכז הכלכלה
# אשדוד היא מטרופולין המהווה את מרכז הכלכלה
```
```python
@torch.inference_mode()
def ppl_naive(text, model, tokenizer):
    """Return ``exp(loss)`` for ``text`` scored against itself without masking.

    The encoded text is passed to ``model`` both as the input and as the
    labels, so the model can see every token it is scored on. The result is
    therefore only a rough, "naive" score rather than a true perplexity.

    Args:
        text: The sentence to score.
        model: Callable invoked as ``model(input_ids, labels=input_ids)``
            whose result holds the loss at index 0 (presumably a Hugging Face
            masked-LM model; confirm against the caller).
        tokenizer: Object providing ``encode(text, return_tensors="pt")``.

    Returns:
        ``exp(loss)`` as a Python float.
    """
    input_ids = tokenizer.encode(text, return_tensors="pt")
    # Index 0 of the model output is the loss when labels are supplied.
    loss = model(input_ids, labels=input_ids)[0]
    return torch.exp(loss).item()
# Template sentence, roughly: "{} is the capital city of the State of Israel,
# and the largest city in Israel by population".
text = """{} היא עיר הבירה של מדינת ישראל, והעיר הגדולה ביותר בישראל בגודל האוכלוסייה"""

# Score the sentence with three candidate city names (Haifa, Jerusalem, Tel Aviv).
for word in ["חיפה", "ירושלים", "תל אביב"]:
    print(ppl_naive(text.format(word), model, tokenizer))
# 9.825098991394043
# 10.594215393066406
# 9.536449432373047
# One would expect "ירושלים" (Jerusalem) to get the smallest value, but it
# gets the largest: the naive score lets the model see every token it is
# scored on, so it is not a reliable measure of how plausible the sentence is.
@torch.inference_mode()
def ppl_pseudo(text, model, tokenizer, ignore_idx=-100):
    """Return a pseudo-perplexity for ``text`` by masking one token at a time.

    The encoded sentence is copied once per inner position (every position
    except the first and the last, which are assumed to be the special tokens
    the tokenizer adds, e.g. ``[CLS]``/``[SEP]``; confirm for other
    tokenizers). In copy ``i`` only inner position ``i`` is replaced by the
    mask token, and only that position carries a label. All copies are scored
    in a single batch of shape ``(seq_len - 2, seq_len)``.

    Args:
        text: The sentence to score.
        model: Callable invoked as ``model(input_ids, labels=labels)`` whose
            result holds the loss at index 0 (presumably a Hugging Face
            masked-LM model; confirm against the caller).
        tokenizer: Object providing ``encode(text, return_tensors="pt")`` and
            a ``mask_token_id`` attribute.
        ignore_idx: Label value written at positions that must not contribute
            to the loss.

    Returns:
        ``exp(loss)`` as a Python float.

    Raises:
        ValueError: If the encoded text has no inner position to mask.
    """
    input_ids = tokenizer.encode(text, return_tensors="pt")
    seq_len = input_ids.size(-1)
    if seq_len < 3:
        raise ValueError(
            "text must encode to at least one token between the first and "
            f"last positions (got sequence length {seq_len})"
        )

    # Rows 1..seq_len-2 of the identity matrix: row i selects inner position
    # i + 1, so the first and last positions are never masked.
    mask = torch.eye(seq_len, dtype=torch.bool, device=input_ids.device)[1:-1]
    repeated = input_ids.repeat(seq_len - 2, 1)

    masked_input = repeated.masked_fill(mask, tokenizer.mask_token_id)
    # Build labels from the mask itself rather than from "is a mask token", so
    # a mask token already present in the text is scored only in its own row.
    labels = repeated.masked_fill(~mask, ignore_idx)

    # Index 0 of the model output is the loss when labels are supplied.
    loss = model(masked_input, labels=labels)[0]
    return torch.exp(loss).item()
# Score the same template with the three city names (Haifa, Jerusalem,
# Tel Aviv), this time masking one token at a time.
for word in ["חיפה", "ירושלים", "תל אביב"]:
    print(ppl_pseudo(text.format(word), model, tokenizer))
# 4.346900939941406
# 3.292382001876831
# 2.732590913772583
```
When using AlephBertGimmel, please cite:
```bibtex
@misc{guetta2022large,
title={Large Pre-Trained Models with Extra-Large Vocabularies: A Contrastive Analysis of Hebrew BERT Models and a New One to Outperform Them All},
author={Eylon Guetta and Avi Shmidman and Shaltiel Shmidman and Cheyn Shmuel Shmidman and Joshua Guedalia and Moshe Koppel and Dan Bareket and Amit Seker and Reut Tsarfaty},
year={2022},
eprint={2211.15199},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```