File size: 4,102 Bytes
2f9df49
 
 
 
 
 
 
 
 
 
 
 
 
 
d89fd8e
 
2f9df49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d89fd8e
2f9df49
 
 
 
d89fd8e
 
2f9df49
 
d7cb368
2f9df49
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import io
import sys
import gradio as gr
import srt
import jiwer

from dataclasses import dataclass
from dataclasses_json import dataclass_json
from datetime import timedelta


@dataclass_json
@dataclass
class ZHTW_Sub:
    """One subtitle cue carrying a pair of parallel texts.

    Instances are built by ``filter_sub`` from parsed SRT cues and later
    combined by ``merge_sub`` / ``merge_sub2``, which extend ``end`` and append
    to ``zh`` / ``tw`` in place.
    """
    start: timedelta  # cue start time
    end: timedelta  # cue end time (extended when cues are merged)
    zh: str  # second half of the cue text; presumably the Mandarin line -- confirm
    tw: str  # first half of the cue text; presumably the Taiwanese line -- confirm

def read_srt(p):
    """Parse the SRT file at path ``p`` and return its cues as a list.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's locale encoding, which would garble CJK text on systems whose
    default is not UTF-8.  ``utf-8-sig`` additionally strips a leading BOM,
    which subtitle editors commonly write, and is otherwise identical to
    plain UTF-8.

    Args:
        p: Path of the ``.srt`` file to read.

    Returns:
        A list of the subtitle objects produced by ``srt.parse``.
    """
    with open(p, encoding="utf-8-sig") as f:
        return list(srt.parse(f.read()))
   
def merge_sub(subs):
    """Fuse cues that touch exactly (previous ``end`` == next ``start``).

    A run of back-to-back cues collapses into the first cue of the run: its
    ``end`` moves to the end of the last cue and the ``zh`` / ``tw`` texts are
    appended, separated by a single space.  The list is modified in place and
    the same list object is returned.
    """
    if len(subs) < 2:
        return subs

    kept = [subs[0]]
    for cur in subs[1:]:
        prev = kept[-1]
        if prev.end != cur.start:
            # Gap or overlap: `cur` starts a new run.
            kept.append(cur)
            continue
        prev.end = cur.end
        prev.zh += f" {cur.zh}"
        prev.tw += f" {cur.tw}"

    # Write back through the slice so callers holding `subs` see the result.
    subs[:] = kept
    return subs

def merge_sub2(subs, delta):
    """Fuse cues whose gap to the previous cue is at most ``delta``.

    A cue is folded into its predecessor unless ``start - previous.end`` is
    greater than ``delta``.  Folding moves the predecessor's ``end`` to the
    folded cue's ``end`` and appends the ``zh`` / ``tw`` texts with a single
    space.  The list is modified in place and the same list object is
    returned.
    """
    if len(subs) < 2:
        return subs

    kept = [subs[0]]
    for cur in subs[1:]:
        prev = kept[-1]
        if cur.start - prev.end > delta:
            # Too far from the previous cue: keep it separate.
            kept.append(cur)
            continue
        prev.end = cur.end
        prev.zh += f" {cur.zh}"
        prev.tw += f" {cur.tw}"

    # Write back through the slice so callers holding `subs` see the result.
    subs[:] = kept
    return subs

def _split_tw_zh(text, sub, log):
    """Split one cue line into ``(tw, zh)``, or return ``None`` if malformed.

    Two layouts are accepted: ``"tw | zh"`` with exactly one ``|``, or a
    whitespace-separated line with an even number of tokens whose first half
    is ``tw`` and second half is ``zh``.  Rejections are reported via ``log``.
    """
    if '|' in text:
        parts = text.split('|')
        # Exactly two sides are required; any other count cannot be unpacked
        # into a (tw, zh) pair.
        if len(parts) != 2:
            log('修:多槓', parts)
            return None
        tw, zh = (part.strip() for part in parts)
        return tw, zh

    tokens = text.split()
    # An empty line has no pair to extract, and an odd token count cannot be
    # halved evenly.  Empty lines are rejected here so that the error-rate
    # call below never sees an empty reference string.
    if not tokens or len(tokens) % 2 != 0:
        log('修:不均', sub.start, sub.end, tokens)
        return None

    mid = len(tokens) // 2
    tw = ' '.join(tokens[:mid])
    zh = ' '.join(tokens[mid:])
    # A character error rate above 1 means the halves differ by more edits
    # than the reference has characters; flag it but keep the pair.
    if jiwer.cer(tw, zh) > 1:
        log('註:差距', sub.start, sub.end, 'tw:', tw, 'zh:', zh)
    return tw, zh


def filter_sub(subs):
    """Convert parsed SRT cues into ``ZHTW_Sub`` entries and collect a log.

    Each cue's ``content`` is handled as follows:

    * contains ``#``: logged and skipped;
    * contains a newline: logged and skipped, and the next usable cue is
      appended to the last accepted entry instead of starting a new one;
    * otherwise it is split into a ``tw`` / ``zh`` pair; cues that cannot be
      split are logged and skipped.

    Log lines are written directly to an in-memory buffer rather than by
    swapping ``sys.stdout``, so an exception cannot leave the process-wide
    stdout redirected and unrelated output is not captured.

    Args:
        subs: Iterable of cue objects exposing ``start``, ``end`` and
            ``content``.

    Returns:
        A ``(new_subs, buffer)`` tuple: the list of ``ZHTW_Sub`` entries and
        the ``io.StringIO`` holding the log text.
    """
    buffer = io.StringIO()

    def log(*args):
        print(*args, file=buffer)

    new_subs = []
    carry_next = False
    for s in subs:
        content = s.content
        if '#' in content:
            log('註:標記', s.start, s.end, content)
            continue

        if '\n' in content:
            log('修:分行', '\\n', s.start, content)
            carry_next = True
            continue

        pair = _split_tw_zh(content, s, log)
        if pair is None:
            # Skip the cue entirely; a pending carry stays pending until the
            # next cue that yields a usable pair.
            continue
        tw, zh = pair

        # With nothing accepted yet there is no entry to extend, so a pending
        # carry falls back to starting a fresh entry.
        if carry_next and new_subs:
            last = new_subs[-1]
            last.zh += f" {zh}"
            last.tw += f" {tw}"
            last.end = s.end
        else:
            new_subs.append(ZHTW_Sub(s.start, s.end, zh, tw))
        carry_next = False
    return new_subs, buffer

def update_yield():
    """Create a logger that returns the whole accumulated log on every call.

    Returns:
        A function ``update_print(inp, *more)``.  Each call appends one log
        line and returns all lines so far joined with newlines.  Like
        ``print``, several arguments are converted with ``str`` and joined
        with single spaces to form that one line.
    """
    lines = []

    def update_print(inp, *more):
        lines.append(' '.join(str(item) for item in (inp, *more)))
        return '\n'.join(lines)

    return update_print

def parse_srt(file):
    """Validate an uploaded SRT file, yielding the growing log text.

    This is a generator: every yielded value is the complete log so far, so a
    consumer can simply display the latest value.

    Steps reported, in order: the file path, the number of parsed cues, the
    filter log, the number of accepted cues, the number of cues after merging
    touching cues, every merged cue longer than 30 seconds (its index, then
    its end time, duration and ``tw`` text), and finally the total usable
    duration.

    Args:
        file: ``None``, a path string, or an object whose ``name`` attribute
            is the path of the uploaded file.

    Yields:
        The accumulated log as a single newline-joined string.
    """
    if file is None:
        # A `return <value>` inside a generator is never delivered to the
        # consumer, so the message has to be yielded.
        yield "No file uploaded."
        return

    # Accept both a plain path and a file-like wrapper exposing `.name`.
    path = getattr(file, "name", file)

    upd = update_yield()
    yield upd(path)
    subs = read_srt(path)
    yield upd(len(subs))
    new_subs, logs = filter_sub(subs)
    yield upd(logs.getvalue())
    yield upd(len(new_subs))
    new_subs = merge_sub(new_subs)
    yield upd(len(new_subs))

    total_dur = 0
    for i, it in enumerate(new_subs):
        duration = (it.end - it.start).total_seconds()
        if duration > 30:
            # Report unusually long cues so they can be inspected by hand.
            yield upd(i)
            # Build one string: the logger is called with a single argument.
            yield upd(f"{it.end.total_seconds()} {duration} {it.tw}")
        total_dur += duration
    yield upd("可用時長 "+str(timedelta(seconds=int(total_dur))))

# Build the Gradio UI: a heading, an .srt upload widget and a log textbox.
with gr.Blocks() as demo:
    gr.Markdown("## SRT File Validator")

    with gr.Column():
        # Only files with the .srt extension are offered in the picker.
        file_input = gr.File(label="Upload .srt File", file_types=[".srt"])
        output_log = gr.Textbox(label="Parsing Log", lines=10, max_lines=120)

    # Run parse_srt whenever the uploaded file changes and send its output to
    # the log textbox.  parse_srt is a generator; presumably each yielded
    # string replaces the textbox content -- confirm against the Gradio docs.
    file_input.change(fn=parse_srt, inputs=file_input, outputs=output_log)

# Starts the app as a module-level side effect (there is no __main__ guard).
demo.launch()