Commit 81b1588 (parent: d09f022): Create README.md

README.md (ADDED):

This repo contains the fully trained ByT5 model that was used to estimate per-character entropies. With it, you can also recreate the illustration from the paper.
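
Download the model snapshot from the Hugging Face Hub: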

```python
from huggingface_hub import snapshot_download

data_folder = snapshot_download("fxtentacle/tevr-token-entropy-predictor-de")
```
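
Load the trained ByT5 model and move it to the GPU: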

```python
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained(data_folder)
model.to('cuda')
model.eval()
```
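
The helper below computes the per-character (strictly, per UTF-8 byte) cross-entropy: the decoder consumes the byte sequence and each output logit is scored against the byte it predicts.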

```python
import torch

def text_to_cross_entropy(text):
    # decoder input: a leading 0 (pad/start token) followed by the UTF-8 bytes of the text
    ttext = torch.tensor([[0] + list(text.encode('UTF-8'))], dtype=torch.int64).to('cuda')
    # minimal one-token encoder input; the per-character prediction comes from the decoder
    tone = torch.tensor([[1]], dtype=torch.int32).to('cuda')
    logits = model.forward(input_ids=tone, attention_mask=tone, decoder_input_ids=ttext, return_dict=False)[0].detach()
    # logits[:-1] predict ttext[1:], so each byte is scored given its predecessors
    cross_entropy = torch.nn.functional.cross_entropy(input=logits[0][:-1], target=ttext[0][1:], reduction='none').detach().cpu().numpy()
    return cross_entropy
```
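
The TEVR tokenizer ships with the snapshot, so add the download folder to the import path: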

```python
import sys

sys.path.append(data_folder)
from text_tokenizer import HajoTextTokenizer
```
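
Then instantiate it from the bundled vocabulary file: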

```python
tokenizer_file = 'text-tokenizer-de-4m.txt'
text_tokenizer = HajoTextTokenizer(data_folder + '/' + tokenizer_file)
```
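
Now compute the per-character entropies for an example sentence and aggregate them over the TEVR tokens: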

```python
text = "die katze ist niedlich"
cross_entropy = text_to_cross_entropy(text)

tokens = text_tokenizer.encode(text)
tokens = [text_tokenizer.all_tokens[t] for t in tokens]
print(tokens)

token_sums = []   # per-character entropy, averaged within each token
token_sums2 = []  # total entropy per token
for t in tokens:
    # len(token_sums) tracks the current character offset into the text
    ce = sum(cross_entropy[len(token_sums):len(token_sums) + len(t)])
    for r in range(len(t)):
        token_sums.append(ce / len(t))
    token_sums2.append(ce)
print(token_sums)
```

Output:

```
['die', ' ', 'k', 'at', 'ze', ' ', 'ist', ' ', 'n', 'ied', 'lich']
[3.3762913048267365, 3.3762913048267365, 3.3762913048267365, 0.29695791006088257, 4.193424224853516, 2.3430762887001038, 2.3430762887001038, 2.8417416363954544, 2.8417416363954544, 1.1227068901062012, 2.017452405144771, 2.017452405144771, 2.017452405144771, 0.0016304069431498647, 2.580254554748535, 2.3091587026913962, 2.3091587026913962, 2.3091587026913962, 1.0126478232632508, 1.0126478232632508, 1.0126478232632508, 1.0126478232632508]
```
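
The following cell renders the illustration from the paper as an HTML table: the characters (1), a uniform 1.0 baseline (2), the raw per-character entropy (3), the TEVR tokens (4), the total entropy per token (5), and the per-character entropy averaged within each token (6), each distribution with its variance σ²: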

```python
import numpy as np
import IPython

html = '<table style="font-size: 20px; font-family: Roboto">'
# (1) the individual characters
html += '<tr><td><b>(1)</b></td>' + ''.join([f'<td style="text-align:left">{c}</td>' for c in list(text)]) + '</tr>'
# (2) a uniform baseline of 1.0 per character
html += '<tr><td><b>(2)</b></td>' + ''.join(['<td>1.0</td>' for v in cross_entropy]) + '<td>σ²={:3.1f}</td>'.format(np.var([1.0 for v in cross_entropy])) + '</tr>'
# (3) the raw per-character cross-entropy
html += '<tr><td><b>(3)</b></td>' + ''.join(['<td>{:3.1f}</td>'.format(v) for v in cross_entropy]) + '<td>σ²={:3.1f}</td>'.format(np.var(cross_entropy)) + '</tr>'
# (4) the TEVR tokens, each spanning its characters
html += '<tr><td><b>(4)</b></td>' + ''.join([f'<td style="text-align:center" colspan={len(t)}>{t}</td>' for t in tokens]) + '</tr>'
# (5) the total entropy per token
html += '<tr><td><b>(5)</b></td>' + ''.join([f'<td style="text-align:center" colspan={len(t)}>{"{:3.1f}".format(token_sums2[i])}</td>' for i, t in enumerate(tokens)]) + '</tr>'
# (6) the per-character entropy averaged within each token
html += '<tr><td><b>(6)</b></td>' + ''.join(['<td>{:3.1f}</td>'.format(v) for v in token_sums]) + '<td>σ²={:3.1f}</td>'.format(np.var(token_sums)) + '</tr>'
html += '</table>'

IPython.display.HTML(html)
```
<table style="font-size: 20px; font-family: Roboto"><tr><td><b>(1)</b></td><td style="text-align:left">d</td><td style="text-align:left">i</td><td style="text-align:left">e</td><td style="text-align:left"> </td><td style="text-align:left">k</td><td style="text-align:left">a</td><td style="text-align:left">t</td><td style="text-align:left">z</td><td style="text-align:left">e</td><td style="text-align:left"> </td><td style="text-align:left">i</td><td style="text-align:left">s</td><td style="text-align:left">t</td><td style="text-align:left"> </td><td style="text-align:left">n</td><td style="text-align:left">i</td><td style="text-align:left">e</td><td style="text-align:left">d</td><td style="text-align:left">l</td><td style="text-align:left">i</td><td style="text-align:left">c</td><td style="text-align:left">h</td></tr><tr><td><b>(2)</b></td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>σ²=0.0</td></tr><tr><td><b>(3)</b></td><td>8.9</td><td>1.0</td><td>0.2</td><td>0.3</td><td>4.2</td><td>1.6</td><td>3.1</td><td>5.4</td><td>0.3</td><td>1.1</td><td>3.0</td><td>3.0</td><td>0.0</td><td>0.0</td><td>2.6</td><td>0.6</td><td>4.4</td><td>1.9</td><td>4.0</td><td>0.0</td><td>0.0</td><td>0.0</td><td>σ²=5.0</td></tr><tr><td><b>(4)</b></td><td style="text-align:center" colspan=3>die</td><td style="text-align:center" colspan=1> </td><td style="text-align:center" colspan=1>k</td><td style="text-align:center" colspan=2>at</td><td style="text-align:center" colspan=2>ze</td><td style="text-align:center" colspan=1> </td><td style="text-align:center" colspan=3>ist</td><td style="text-align:center" colspan=1> </td><td style="text-align:center" colspan=1>n</td><td style="text-align:center" colspan=3>ied</td><td style="text-align:center" colspan=4>lich</td></tr><tr><td><b>(5)</b></td><td style="text-align:center" colspan=3>10.1</td><td style="text-align:center" colspan=1>0.3</td><td style="text-align:center" colspan=1>4.2</td><td style="text-align:center" colspan=2>4.7</td><td style="text-align:center" colspan=2>5.7</td><td style="text-align:center" colspan=1>1.1</td><td style="text-align:center" colspan=3>6.1</td><td style="text-align:center" colspan=1>0.0</td><td style="text-align:center" colspan=1>2.6</td><td style="text-align:center" colspan=3>6.9</td><td style="text-align:center" colspan=4>4.1</td></tr><tr><td><b>(6)</b></td><td>3.4</td><td>3.4</td><td>3.4</td><td>0.3</td><td>4.2</td><td>2.3</td><td>2.3</td><td>2.8</td><td>2.8</td><td>1.1</td><td>2.0</td><td>2.0</td><td>2.0</td><td>0.0</td><td>2.6</td><td>2.3</td><td>2.3</td><td>2.3</td><td>1.0</td><td>1.0</td><td>1.0</td><td>1.0</td><td>σ²=1.1</td></tr></table>
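
The variance reduction can also be read off numerically. A minimal sketch, assuming the variables from the cells above are still in scope:

```python
import numpy as np

# σ² of the uniform baseline, the raw per-character entropies,
# and the entropies averaged within TEVR tokens (rows 2, 3, 6 above)
print('baseline σ²      = {:3.1f}'.format(np.var([1.0] * len(cross_entropy))))  # 0.0
print('per character σ² = {:3.1f}'.format(np.var(cross_entropy)))               # ≈ 5.0
print('TEVR-averaged σ² = {:3.1f}'.format(np.var(token_sums)))                  # ≈ 1.1
```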
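
Finally, print the tokenizer's full vocabulary: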

```python
from text_tokenizer import HajoTextTokenizer

text_tokenizer = HajoTextTokenizer(data_folder + '/' + tokenizer_file)
tt = text_tokenizer.all_tokens
print(', '.join(tt))
```

Output:

```
<pad>, <eos>, , chen, sche, lich, isch, icht, iche, eine, rden, tion, urde, haft, eich, rung, chte, ssen, chaf, nder, tlic, tung, eite, iert, sich, ngen, erde, scha, nden, unge, lung, mmen, eren, ende, inde, erun, sten, iese, igen, erte, iner, tsch, keit, der, die, ter, und, ein, ist, den, ten, ber, ver, sch, ung, ste, ent, ach, nte, auf, ben, eit, des, ers, aus, das, von, ren, gen, nen, lle, hre, mit, iel, uch, lte, ann, lie, men, dem, and, ind, als, sta, elt, ges, tte, ern, wir, ell, war, ere, rch, abe, len, ige, ied, ger, nnt, wei, ele, och, sse, end, all, ahr, bei, sie, ede, ion, ieg, ege, auc, che, rie, eis, vor, her, ang, für, ass, uss, tel, er, in, ge, en, st, ie, an, te, be, re, zu, ar, es, ra, al, or, ch, et, ei, un, le, rt, se, is, ha, we, at, me, ne, ur, he, au, ro, ti, li, ri, eh, im, ma, tr, ig, el, um, la, am, de, so, ol, tz, il, on, it, sc, sp, ko, na, pr, ni, si, fe, wi, ns, ke, ut, da, gr, eu, mi, hr, ze, hi, ta, ss, ng, sa, us, ba, ck, em, kt, ka, ve, fr, bi, wa, ah, gt, di, ab, fo, to, rk, as, ag, gi, hn, s, t, n, m, r, l, f, e, a, b, d, h, k, g, o, i, u, w, p, z, ä, ü, v, ö, j, c, y, x, q, á, í, ō, ó, š, é, č, ?
```