KevinHuSh committed on
Commit f4456af · 1 Parent(s): 95c6cbb

init python part (#7)

.gitignore CHANGED
@@ -2,6 +2,7 @@
  # will have compiled files and executables
  debug/
  target/
+ __pycache__/
 
  # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
  # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
@@ -12,6 +13,7 @@ Cargo.lock
 
  # MSVC Windows builds of rustc generate these, which store debugging information
  *.pdb
+ *.trie
 
  .idea/
  .env
python/conf/logging.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "version":1,
+   "disable_existing_loggers":false,
+   "formatters":{
+     "simple":{
+       "format":"%(asctime)s - %(name)s - %(levelname)s - %(filename)s - %(lineno)d - %(message)s"
+     }
+   },
+   "handlers":{
+     "console":{
+       "class":"logging.StreamHandler",
+       "level":"DEBUG",
+       "formatter":"simple",
+       "stream":"ext://sys.stdout"
+     },
+     "info_file_handler":{
+       "class":"logging.handlers.TimedRotatingFileHandler",
+       "level":"INFO",
+       "formatter":"simple",
+       "filename":"log/info.log",
+       "when": "MIDNIGHT",
+       "interval":1,
+       "backupCount":30,
+       "encoding":"utf8"
+     },
+     "error_file_handler":{
+       "class":"logging.handlers.TimedRotatingFileHandler",
+       "level":"ERROR",
+       "formatter":"simple",
+       "filename":"log/errors.log",
+       "when": "MIDNIGHT",
+       "interval":1,
+       "backupCount":30,
+       "encoding":"utf8"
+     }
+   },
+   "root":{
+     "level":"DEBUG",
+     "handlers":["console","info_file_handler","error_file_handler"]
+   }
+ }
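The file above is a standard logging dictConfig (version 1): a DEBUG console handler plus two midnight-rotating file handlers under log/. A minimal sketch of how such a file is typically loaded; the config path and the pre-created log/ directory are assumptions, not something this commit shows:

import json
import logging
import logging.config
import os

os.makedirs("log", exist_ok=True)          # TimedRotatingFileHandler needs the directory to exist
with open("python/conf/logging.json") as f:  # assumed relative path
    logging.config.dictConfig(json.load(f))

logging.getLogger(__name__).info("logging configured")  # goes to console and log/info.log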
python/conf/mapping.json ADDED
@@ -0,0 +1,140 @@
+ {
+   "settings": {
+     "index": {
+       "number_of_shards": 4,
+       "number_of_replicas": 0,
+       "refresh_interval" : "1000ms"
+     },
+     "similarity": {
+       "scripted_sim": {
+         "type": "scripted",
+         "script": {
+           "source": "double idf = Math.log(1+(field.docCount-term.docFreq+0.5)/(term.docFreq + 0.5))/Math.log(1+((field.docCount-0.5)/1.5)); return query.boost * idf * Math.min(doc.freq, 1);"
+         }
+       }
+     }
+   },
+   "mappings": {
+     "properties": {
+       "lat_lon": {"type": "geo_point", "store":"true"}
+     },
+     "date_detection": "true",
+     "dynamic_templates": [
+       {
+         "int": {
+           "match": "*_int",
+           "mapping": {
+             "type": "integer",
+             "store": "true"
+           }
+         }
+       },
+       {
+         "numeric": {
+           "match": "*_flt",
+           "mapping": {
+             "type": "float",
+             "store": true
+           }
+         }
+       },
+       {
+         "tks": {
+           "match": "*_tks",
+           "mapping": {
+             "type": "text",
+             "similarity": "scripted_sim",
+             "analyzer": "whitespace",
+             "store": true
+           }
+         }
+       },
+       {
+         "ltks":{
+           "match": "*_ltks",
+           "mapping": {
+             "type": "text",
+             "analyzer": "whitespace",
+             "store": true
+           }
+         }
+       },
+       {
+         "kwd": {
+           "match_pattern": "regex",
+           "match": "^(.*_(kwd|id|ids|uid|uids)|uid)$",
+           "mapping": {
+             "type": "keyword",
+             "similarity": "boolean",
+             "store": true
+           }
+         }
+       },
+       {
+         "dt": {
+           "match_pattern": "regex",
+           "match": "^.*(_dt|_time|_at)$",
+           "mapping": {
+             "type": "date",
+             "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyy-MM-dd_HH:mm:ss",
+             "store": true
+           }
+         }
+       },
+       {
+         "nested": {
+           "match": "*_nst",
+           "mapping": {
+             "type": "nested"
+           }
+         }
+       },
+       {
+         "object": {
+           "match": "*_obj",
+           "mapping": {
+             "type": "object",
+             "dynamic": "true"
+           }
+         }
+       },
+       {
+         "string": {
+           "match": "*_with_weight",
+           "mapping": {
+             "type": "text",
+             "index": "false",
+             "store": true
+           }
+         }
+       },
+       {
+         "string": {
+           "match": "*_fea",
+           "mapping": {
+             "type": "rank_feature"
+           }
+         }
+       },
+       {
+         "dense_vector": {
+           "match": "*_vec",
+           "mapping": {
+             "type": "dense_vector",
+             "dims": 1024,
+             "index": true,
+             "similarity": "cosine"
+           }
+         }
+       },
+       {
+         "binary": {
+           "match": "*_bin",
+           "mapping": {
+             "type": "binary"
+           }
+         }
+       }
+     ]
+   }
+ }
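mapping.json bundles index settings, a scripted similarity (an IDF-only score that counts at most one occurrence per document), and dynamic templates keyed on field-name suffixes: "*_tks" fields become whitespace-analyzed text with that similarity, "*_kwd"/"*_id" become keywords, "*_vec" a 1024-dim cosine dense_vector, and so on. A hedged sketch of applying it with the official Python client; the client version (elasticsearch-py 8.x style), host, and index name are illustrative only, though sys.cnf below does point at 127.0.0.1:9200 and an index called "toxic":

import json
from elasticsearch import Elasticsearch  # assumes elasticsearch-py 8.x is installed

es = Elasticsearch("http://127.0.0.1:9200")
with open("python/conf/mapping.json") as f:  # assumed relative path
    conf = json.load(f)

# Any document field that matches one of the suffix templates is mapped automatically
# when first indexed; nothing else has to be declared up front.
es.indices.create(index="toxic", settings=conf["settings"], mappings=conf["mappings"])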
python/conf/sys.cnf ADDED
@@ -0,0 +1,4 @@
+ [online]
+ es=127.0.0.1:9200
+ idx_nm=toxic
+
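sys.cnf is a small INI file with a single [online] section holding the Elasticsearch address and index name. A minimal sketch of reading it with the standard library; the relative path is an assumption:

from configparser import ConfigParser

cfg = ConfigParser()
cfg.read("python/conf/sys.cnf")
es_host = cfg.get("online", "es")          # "127.0.0.1:9200"
index_name = cfg.get("online", "idx_nm")   # "toxic"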
python/nlp/__init__.py ADDED
File without changes
python/nlp/huchunk.py ADDED
@@ -0,0 +1,364 @@
1
+ import re
2
+ import os
3
+ import copy
4
+ import base64
5
+ from dataclasses import dataclass
6
+ from typing import List
7
+ import numpy as np
8
+ from io import BytesIO
9
+
10
+
11
+ class HuChunker:
12
+
13
+ def __init__(self):
14
+ self.MAX_LVL = 12
15
+ self.proj_patt = [
16
+ (r"第[零一二三四五六七八九十百]+章", 1),
17
+ (r"第[零一二三四五六七八九十百]+[条节]", 2),
18
+ (r"[零一二三四五六七八九十百]+[、  ]", 3),
19
+ (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
20
+ (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
21
+ (r"[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 6),
22
+ (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
23
+ (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
24
+ (r".{,48}[::??]@", 9),
25
+ (r"[0-9]+)", 10),
26
+ (r"[\((][0-9]+[)\)]", 11),
27
+ (r"[零一二三四五六七八九十百]+是", 12),
28
+ (r"[⚫•➢✓ ]", 12)
29
+ ]
30
+ self.lines = []
31
+
32
+ def _garbage(self, txt):
33
+ patt = [
34
+ r"(在此保证|不得以任何形式翻版|请勿传阅|仅供内部使用|未经事先书面授权)",
35
+ r"(版权(归本公司)*所有|免责声明|保留一切权力|承担全部责任|特别声明|报告中涉及)",
36
+ r"(不承担任何责任|投资者的通知事项:|任何机构和个人|本报告仅为|不构成投资)",
37
+ r"(不构成对任何个人或机构投资建议|联系其所在国家|本报告由从事证券交易)",
38
+ r"(本研究报告由|「认可投资者」|所有研究报告均以|请发邮件至)",
39
+ r"(本报告仅供|市场有风险,投资需谨慎|本报告中提及的)",
40
+ r"(本报告反映|此信息仅供|证券分析师承诺|具备证券投资咨询业务资格)",
41
+ r"^(时间|签字|签章)[::]",
42
+ r"(参考文献|目录索引|图表索引)",
43
+ r"[ ]*年[ ]+月[ ]+日",
44
+ r"^(中国证券业协会|[0-9]+年[0-9]+月[0-9]+日)$",
45
+ r"\.{10,}",
46
+ r"(———————END|帮我转发|欢迎收藏|快来关注我吧)"
47
+ ]
48
+ return any([re.search(p, txt) for p in patt])
49
+
50
+ def _proj_match(self, line):
51
+ for p, j in self.proj_patt:
52
+ if re.match(p, line):
53
+ return j
54
+ return
55
+
56
+ def _does_proj_match(self):
57
+ mat = [None for _ in range(len(self.lines))]
58
+ for i in range(len(self.lines)):
59
+ mat[i] = self._proj_match(self.lines[i])
60
+ return mat
61
+
62
+ def naive_text_chunk(self, text, ti="", MAX_LEN=612):
63
+ if text:
64
+ self.lines = [l.strip().replace(u'\u3000', u' ')
65
+ .replace(u'\xa0', u'')
66
+ for l in text.split("\n\n")]
67
+ self.lines = [l for l in self.lines if not self._garbage(l)]
68
+ self.lines = [re.sub(r"([ ]+| )", " ", l)
69
+ for l in self.lines if l]
70
+ if not self.lines:
71
+ return []
72
+ arr = self.lines
73
+
74
+ res = [""]
75
+ i = 0
76
+ while i < len(arr):
77
+ a = arr[i]
78
+ if not a:
79
+ i += 1
80
+ continue
81
+ if len(a) > MAX_LEN:
82
+ a_ = a.split("\n")
83
+ if len(a_) >= 2:
84
+ arr.pop(i)
85
+ for j in range(2, len(a_) + 1):
86
+ if len("\n".join(a_[:j])) >= MAX_LEN:
87
+ arr.insert(i, "\n".join(a_[:j - 1]))
88
+ arr.insert(i + 1, "\n".join(a_[j - 1:]))
89
+ break
90
+ else:
91
+ assert False, f"Can't split: {a}"
92
+ continue
93
+
94
+ if len(res[-1]) < MAX_LEN / 3:
95
+ res[-1] += "\n" + a
96
+ else:
97
+ res.append(a)
98
+ i += 1
99
+
100
+ if ti:
101
+ for i in range(len(res)):
102
+ if res[i].find("——来自") >= 0:
103
+ continue
104
+ res[i] += f"\t——来自“{ti}”"
105
+
106
+ return res
107
+
108
+ def _merge(self):
109
+ # merge continuous same level text
110
+ lines = [self.lines[0]] if self.lines else []
111
+ for i in range(1, len(self.lines)):
112
+ if self.mat[i] == self.mat[i - 1] \
113
+ and len(lines[-1]) < 256 \
114
+ and len(self.lines[i]) < 256:
115
+ lines[-1] += "\n" + self.lines[i]
116
+ continue
117
+ lines.append(self.lines[i])
118
+ self.lines = lines
119
+ self.mat = self._does_proj_match()
120
+ return self.mat
121
+
122
+ def text_chunks(self, text):
123
+ if text:
124
+ self.lines = [l.strip().replace(u'\u3000', u' ')
125
+ .replace(u'\xa0', u'')
126
+ for l in re.split(r"[\r\n]", text)]
127
+ self.lines = [l for l in self.lines if not self._garbage(l)]
128
+ self.lines = [l for l in self.lines if l]
129
+ self.mat = self._does_proj_match()
130
+ mat = self._merge()
131
+
132
+ tree = []
133
+ for i in range(len(self.lines)):
134
+ tree.append({"proj": mat[i],
135
+ "children": [],
136
+ "read": False})
137
+ # find all children
138
+ for i in range(len(self.lines) - 1):
139
+ if tree[i]["proj"] is None:
140
+ continue
141
+ ed = i + 1
142
+ while ed < len(tree) and (tree[ed]["proj"] is None or
143
+ tree[ed]["proj"] > tree[i]["proj"]):
144
+ ed += 1
145
+
146
+ nxt = tree[i]["proj"] + 1
147
+ st = set([p["proj"] for p in tree[i + 1: ed] if p["proj"]])
148
+ while nxt not in st:
149
+ nxt += 1
150
+ if nxt > self.MAX_LVL:
151
+ break
152
+ if nxt <= self.MAX_LVL:
153
+ for j in range(i + 1, ed):
154
+ if tree[j]["proj"] is not None:
155
+ break
156
+ tree[i]["children"].append(j)
157
+ for j in range(i + 1, ed):
158
+ if tree[j]["proj"] != nxt:
159
+ continue
160
+ tree[i]["children"].append(j)
161
+ else:
162
+ for j in range(i + 1, ed):
163
+ tree[i]["children"].append(j)
164
+
165
+ # get DFS combinations, find all the paths to leaf
166
+ paths = []
167
+
168
+ def dfs(i, path):
169
+ nonlocal tree, paths
170
+ path.append(i)
171
+ tree[i]["read"] = True
172
+ if len(self.lines[i]) > 256:
173
+ paths.append(path)
174
+ return
175
+ if not tree[i]["children"]:
176
+ if len(path) > 1 or len(self.lines[i]) >= 32:
177
+ paths.append(path)
178
+ return
179
+ for j in tree[i]["children"]:
180
+ dfs(j, copy.deepcopy(path))
181
+
182
+ for i, t in enumerate(tree):
183
+ if t["read"]:
184
+ continue
185
+ dfs(i, [])
186
+
187
+ # concat txt on the path for all paths
188
+ res = []
189
+ lines = np.array(self.lines)
190
+ for p in paths:
191
+ if len(p) < 2:
192
+ tree[p[0]]["read"] = False
193
+ continue
194
+ txt = "\n".join(lines[p[:-1]]) + "\n" + lines[p[-1]]
195
+ res.append(txt)
196
+ # concat continuous orphans
197
+ assert len(tree) == len(lines)
198
+ ii = 0
199
+ while ii < len(tree):
200
+ if tree[ii]["read"]:
201
+ ii += 1
202
+ continue
203
+ txt = lines[ii]
204
+ e = ii + 1
205
+ while e < len(tree) and not tree[e]["read"] and len(txt) < 256:
206
+ txt += "\n" + lines[e]
207
+ e += 1
208
+ res.append(txt)
209
+ ii = e
210
+
211
+ # if the node has not been read, find its daddy
212
+ def find_daddy(st):
213
+ nonlocal lines, tree
214
+ proj = tree[st]["proj"]
215
+ if len(self.lines[st]) > 512:
216
+ return [st]
217
+ if proj is None:
218
+ proj = self.MAX_LVL + 1
219
+ for i in range(st - 1, -1, -1):
220
+ if tree[i]["proj"] and tree[i]["proj"] < proj:
221
+ a = [st] + find_daddy(i)
222
+ return a
223
+ return []
224
+
225
+ return res
226
+
227
+
228
+ class PdfChunker(HuChunker):
229
+
230
+ @dataclass
231
+ class Fields:
232
+ text_chunks: List = None
233
+ table_chunks: List = None
234
+
235
+ def __init__(self, pdf_parser):
236
+ self.pdf = pdf_parser
237
+ super().__init__()
238
+
239
+ def tableHtmls(self, pdfnm):
240
+ _, tbls = self.pdf(pdfnm, return_html=True)
241
+ res = []
242
+ for img, arr in tbls:
243
+ if arr[0].find("<table>") < 0:
244
+ continue
245
+ buffered = BytesIO()
246
+ if img:
247
+ img.save(buffered, format="JPEG")
248
+ img_str = base64.b64encode(
249
+ buffered.getvalue()).decode('utf-8') if img else ""
250
+ res.append({"table": arr[0], "image": img_str})
251
+ return res
252
+
253
+ def html(self, pdfnm):
254
+ txts, tbls = self.pdf(pdfnm, return_html=True)
255
+ res = []
256
+ txt_cks = self.text_chunks(txts)
257
+ for txt, img in [(self.pdf.remove_tag(c), self.pdf.crop(c))
258
+ for c in txt_cks]:
259
+ buffered = BytesIO()
260
+ if img:
261
+ img.save(buffered, format="JPEG")
262
+ img_str = base64.b64encode(
263
+ buffered.getvalue()).decode('utf-8') if img else ""
264
+ res.append({"table": "<p>%s</p>" % txt.replace("\n", "<br/>"),
265
+ "image": img_str})
266
+
267
+ for img, arr in tbls:
268
+ if not arr:
269
+ continue
270
+ buffered = BytesIO()
271
+ if img:
272
+ img.save(buffered, format="JPEG")
273
+ img_str = base64.b64encode(
274
+ buffered.getvalue()).decode('utf-8') if img else ""
275
+ res.append({"table": arr[0], "image": img_str})
276
+
277
+ return res
278
+
279
+ def __call__(self, pdfnm, return_image=True, naive_chunk=False):
280
+ flds = self.Fields()
281
+ text, tbls = self.pdf(pdfnm)
282
+ fnm = pdfnm
283
+ txt_cks = self.text_chunks(text) if not naive_chunk else \
284
+ self.naive_text_chunk(text, ti=fnm if isinstance(fnm, str) else "")
285
+ flds.text_chunks = [(self.pdf.remove_tag(c),
286
+ self.pdf.crop(c) if return_image else None) for c in txt_cks]
287
+
288
+ flds.table_chunks = [(arr, img if return_image else None)
289
+ for img, arr in tbls]
290
+ return flds
291
+
292
+
293
+ class DocxChunker(HuChunker):
294
+ def __init__(self, doc_parser):
295
+ self.doc = doc_parser
296
+ super().__init__()
297
+
298
+ def _does_proj_match(self):
299
+ mat = []
300
+ for s in self.styles:
301
+ s = s.split(" ")[-1]
302
+ try:
303
+ mat.append(int(s))
304
+ except Exception as e:
305
+ mat.append(None)
306
+ return mat
307
+
308
+ def _merge(self):
309
+ i = 1
310
+ while i < len(self.lines):
311
+ if self.mat[i] == self.mat[i - 1] \
312
+ and len(self.lines[i - 1]) < 256 \
313
+ and len(self.lines[i]) < 256:
314
+ self.lines[i - 1] += "\n" + self.lines[i]
315
+ self.styles.pop(i)
316
+ self.lines.pop(i)
317
+ self.mat.pop(i)
318
+ continue
319
+ i += 1
320
+ self.mat = self._does_proj_match()
321
+ return self.mat
322
+
323
+ def __call__(self, fnm):
324
+ flds = self.Fields()
325
+ flds.title = os.path.splitext(
326
+ os.path.basename(fnm))[0] if isinstance(
327
+ fnm, type("")) else ""
328
+ secs, tbls = self.doc(fnm)
329
+ self.lines = [l for l, s in secs]
330
+ self.styles = [s for l, s in secs]
331
+
332
+ txt_cks = self.text_chunks("")
333
+ flds.text_chunks = [(t, None) for t in txt_cks if not self._garbage(t)]
334
+ flds.table_chunks = [(tb, None) for tb in tbls for t in tb if t]
335
+ return flds
336
+
337
+
338
+ class ExcelChunker(HuChunker):
339
+ def __init__(self, excel_parser):
340
+ self.excel = excel_parser
341
+ super().__init__()
342
+
343
+ def __call__(self, fnm):
344
+ flds = self.Fields()
345
+ flds.text_chunks = [(t, None) for t in self.excel(fnm)]
346
+ flds.table_chunks = []
347
+ return flds
348
+
349
+
350
+ if __name__ == "__main__":
351
+ import sys
352
+ sys.path.append(os.path.dirname(__file__) + "/../")
353
+ if sys.argv[1].split(".")[-1].lower() == "pdf":
354
+ from parser import PdfParser
355
+ ckr = PdfChunker(PdfParser())
356
+ if sys.argv[1].split(".")[-1].lower().find("doc") >= 0:
357
+ from parser import DocxParser  # relative import fails when run as a script; the parent dir is already on sys.path
358
+ ckr = DocxChunker(DocxParser())
359
+ if sys.argv[1].split(".")[-1].lower().find("xlsx") >= 0:
360
+ from parser import ExcelParser
361
+ ckr = ExcelChunker(ExcelParser())
362
+
363
+ # ckr.html(sys.argv[1])
364
+ print(ckr(sys.argv[1]))
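HuChunker.text_chunks builds a heading tree from the numbered/bulleted patterns in proj_patt and then walks root-to-leaf paths, so each chunk keeps its ancestor headings; naive_text_chunk instead packs paragraphs up to MAX_LEN and appends a "——来自" source tag. A rough usage sketch of the plain-text path (no PDF/Docx parser needed); the sample text is made up and python/ is assumed to be on sys.path:

from nlp.huchunk import HuChunker

ck = HuChunker()
text = "第一章 总则\n1、适用范围\n本办法适用于全体员工。\n2、目的\n规范报销流程。"
for chunk in ck.text_chunks(text):
    print("----\n" + chunk)   # each chunk carries its chapter/section context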
python/nlp/huqie.py ADDED
@@ -0,0 +1,411 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import copy
4
+ import datrie
5
+ import math
6
+ import os
7
+ import re
8
+ import string
9
+ import sys
10
+ from hanziconv import HanziConv
11
+
12
+
13
+ class Huqie:
14
+ def key_(self, line):
15
+ return str(line.lower().encode("utf-8"))[2:-1]
16
+
17
+ def rkey_(self, line):
18
+ return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]
19
+
20
+ def loadDict_(self, fnm):
21
+ print("[HUQIE]:Build trie", fnm, file=sys.stderr)
22
+ try:
23
+ of = open(fnm, "r")
24
+ while True:
25
+ line = of.readline()
26
+ if not line:
27
+ break
28
+ line = re.sub(r"[\r\n]+", "", line)
29
+ line = re.split(r"[ \t]", line)
30
+ k = self.key_(line[0])
31
+ F = int(math.log(float(line[1]) / self.DENOMINATOR) + .5)
32
+ if k not in self.trie_ or self.trie_[k][0] < F:
33
+ self.trie_[self.key_(line[0])] = (F, line[2])
34
+ self.trie_[self.rkey_(line[0])] = 1
35
+ self.trie_.save(fnm + ".trie")
36
+ of.close()
37
+ except Exception as e:
38
+ print("[HUQIE]:Faild to build trie, ", fnm, e, file=sys.stderr)
39
+
40
+ def __init__(self, debug=False):
41
+ self.DEBUG = debug
42
+ self.DENOMINATOR = 1000000
43
+ self.trie_ = datrie.Trie(string.printable)
44
+ self.DIR_ = ""
45
+ if os.path.exists("../res/huqie.txt"):
46
+ self.DIR_ = "../res/huqie"
47
+ if os.path.exists("./res/huqie.txt"):
48
+ self.DIR_ = "./res/huqie"
49
+ if os.path.exists("./huqie.txt"):
50
+ self.DIR_ = "./huqie"
51
+ assert self.DIR_, f"【Can't find huqie】"
52
+
53
+ self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
54
+ try:
55
+ self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
56
+ return
57
+ except Exception as e:
58
+ print("[HUQIE]:Build default trie", file=sys.stderr)
59
+ self.trie_ = datrie.Trie(string.printable)
60
+
61
+ self.loadDict_(self.DIR_ + ".txt")
62
+
63
+ def loadUserDict(self, fnm):
64
+ try:
65
+ self.trie_ = datrie.Trie.load(fnm + ".trie")
66
+ return
67
+ except Exception as e:
68
+ self.trie_ = datrie.Trie(string.printable)
69
+ self.loadDict_(fnm)
70
+
71
+ def addUserDict(self, fnm):
72
+ self.loadDict_(fnm)
73
+
74
+ def _strQ2B(self, ustring):
75
+ """把字符串全角转半角"""
76
+ rstring = ""
77
+ for uchar in ustring:
78
+ inside_code = ord(uchar)
79
+ if inside_code == 0x3000:
80
+ inside_code = 0x0020
81
+ else:
82
+ inside_code -= 0xfee0
83
+ if inside_code < 0x0020 or inside_code > 0x7e:  # not a half-width character after conversion; keep the original
84
+ rstring += uchar
85
+ else:
86
+ rstring += chr(inside_code)
87
+ return rstring
88
+
89
+ def _tradi2simp(self, line):
90
+ return HanziConv.toSimplified(line)
91
+
92
+ def dfs_(self, chars, s, preTks, tkslist):
93
+ MAX_L = 10
94
+ res = s
95
+ # if s > MAX_L or s>= len(chars):
96
+ if s >= len(chars):
97
+ tkslist.append(preTks)
98
+ return res
99
+
100
+ # pruning
101
+ S = s + 1
102
+ if s + 2 <= len(chars):
103
+ t1, t2 = "".join(chars[s:s + 1]), "".join(chars[s:s + 2])
104
+ if self.trie_.has_keys_with_prefix(self.key_(t1)) and not self.trie_.has_keys_with_prefix(
105
+ self.key_(t2)):
106
+ S = s + 2
107
+ if len(preTks) > 2 and len(
108
+ preTks[-1][0]) == 1 and len(preTks[-2][0]) == 1 and len(preTks[-3][0]) == 1:
109
+ t1 = preTks[-1][0] + "".join(chars[s:s + 1])
110
+ if self.trie_.has_keys_with_prefix(self.key_(t1)):
111
+ S = s + 2
112
+
113
+ ################
114
+ for e in range(S, len(chars) + 1):
115
+ t = "".join(chars[s:e])
116
+ k = self.key_(t)
117
+
118
+ if e > s + 1 and not self.trie_.has_keys_with_prefix(k):
119
+ break
120
+
121
+ if k in self.trie_:
122
+ pretks = copy.deepcopy(preTks)
123
+ if k in self.trie_:
124
+ pretks.append((t, self.trie_[k]))
125
+ else:
126
+ pretks.append((t, (-12, '')))
127
+ res = max(res, self.dfs_(chars, e, pretks, tkslist))
128
+
129
+ if res > s:
130
+ return res
131
+
132
+ t = "".join(chars[s:s + 1])
133
+ k = self.key_(t)
134
+ if k in self.trie_:
135
+ preTks.append((t, self.trie_[k]))
136
+ else:
137
+ preTks.append((t, (-12, '')))
138
+
139
+ return self.dfs_(chars, s + 1, preTks, tkslist)
140
+
141
+ def freq(self, tk):
142
+ k = self.key_(tk)
143
+ if k not in self.trie_:
144
+ return 0
145
+ return int(math.exp(self.trie_[k][0]) * self.DENOMINATOR + 0.5)
146
+
147
+ def tag(self, tk):
148
+ k = self.key_(tk)
149
+ if k not in self.trie_:
150
+ return ""
151
+ return self.trie_[k][1]
152
+
153
+ def score_(self, tfts):
154
+ B = 30
155
+ F, L, tks = 0, 0, []
156
+ for tk, (freq, tag) in tfts:
157
+ F += freq
158
+ L += 0 if len(tk) < 2 else 1
159
+ tks.append(tk)
160
+ F /= len(tks)
161
+ L /= len(tks)
162
+ if self.DEBUG:
163
+ print("[SC]", tks, len(tks), L, F, B / len(tks) + L + F)
164
+ return tks, B / len(tks) + L + F
165
+
166
+ def sortTks_(self, tkslist):
167
+ res = []
168
+ for tfts in tkslist:
169
+ tks, s = self.score_(tfts)
170
+ res.append((tks, s))
171
+ return sorted(res, key=lambda x: x[1], reverse=True)
172
+
173
+ def merge_(self, tks):
174
+ patts = [
175
+ (r"[ ]+", " "),
176
+ (r"([0-9\+\.,%\*=-]) ([0-9\+\.,%\*=-])", r"\1\2"),
177
+ ]
178
+ # for p,s in patts: tks = re.sub(p, s, tks)
179
+
180
+ # if split chars is part of token
181
+ res = []
182
+ tks = re.sub(r"[ ]+", " ", tks).split(" ")
183
+ s = 0
184
+ while True:
185
+ if s >= len(tks):
186
+ break
187
+ E = s + 1
188
+ for e in range(s + 2, min(len(tks) + 2, s + 6)):
189
+ tk = "".join(tks[s:e])
190
+ if re.search(self.SPLIT_CHAR, tk) and self.freq(tk):
191
+ E = e
192
+ res.append("".join(tks[s:E]))
193
+ s = E
194
+
195
+ return " ".join(res)
196
+
197
+ def maxForward_(self, line):
198
+ res = []
199
+ s = 0
200
+ while s < len(line):
201
+ e = s + 1
202
+ t = line[s:e]
203
+ while e < len(line) and self.trie_.has_keys_with_prefix(
204
+ self.key_(t)):
205
+ e += 1
206
+ t = line[s:e]
207
+
208
+ while e - 1 > s and self.key_(t) not in self.trie_:
209
+ e -= 1
210
+ t = line[s:e]
211
+
212
+ if self.key_(t) in self.trie_:
213
+ res.append((t, self.trie_[self.key_(t)]))
214
+ else:
215
+ res.append((t, (0, '')))
216
+
217
+ s = e
218
+
219
+ return self.score_(res)
220
+
221
+ def maxBackward_(self, line):
222
+ res = []
223
+ s = len(line) - 1
224
+ while s >= 0:
225
+ e = s + 1
226
+ t = line[s:e]
227
+ while s > 0 and self.trie_.has_keys_with_prefix(self.rkey_(t)):
228
+ s -= 1
229
+ t = line[s:e]
230
+
231
+ while s + 1 < e and self.key_(t) not in self.trie_:
232
+ s += 1
233
+ t = line[s:e]
234
+
235
+ if self.key_(t) in self.trie_:
236
+ res.append((t, self.trie_[self.key_(t)]))
237
+ else:
238
+ res.append((t, (0, '')))
239
+
240
+ s -= 1
241
+
242
+ return self.score_(res[::-1])
243
+
244
+ def qie(self, line):
245
+ line = self._strQ2B(line).lower()
246
+ line = self._tradi2simp(line)
247
+ arr = re.split(self.SPLIT_CHAR, line)
248
+ res = []
249
+ for L in arr:
250
+ if len(L) < 2 or re.match(
251
+ r"[a-z\.-]+$", L) or re.match(r"[0-9\.-]+$", L):
252
+ res.append(L)
253
+ continue
254
+ # print(L)
255
+
256
+ # use maxforward for the first time
257
+ tks, s = self.maxForward_(L)
258
+ tks1, s1 = self.maxBackward_(L)
259
+ if self.DEBUG:
260
+ print("[FW]", tks, s)
261
+ print("[BW]", tks1, s1)
262
+
263
+ diff = [0 for _ in range(max(len(tks1), len(tks)))]
264
+ for i in range(min(len(tks1), len(tks))):
265
+ if tks[i] != tks1[i]:
266
+ diff[i] = 1
267
+
268
+ if s1 > s:
269
+ tks = tks1
270
+
271
+ i = 0
272
+ while i < len(tks):
273
+ s = i
274
+ while s < len(tks) and diff[s] == 0:
275
+ s += 1
276
+ if s == len(tks):
277
+ res.append(" ".join(tks[i:]))
278
+ break
279
+ if s > i:
280
+ res.append(" ".join(tks[i:s]))
281
+
282
+ e = s
283
+ while e < len(tks) and e - s < 5 and diff[e] == 1:
284
+ e += 1
285
+
286
+ tkslist = []
287
+ self.dfs_("".join(tks[s:e + 1]), 0, [], tkslist)
288
+ res.append(" ".join(self.sortTks_(tkslist)[0][0]))
289
+
290
+ i = e + 1
291
+
292
+ res = " ".join(res)
293
+ if self.DEBUG:
294
+ print("[TKS]", self.merge_(res))
295
+ return self.merge_(res)
296
+
297
+ def qieqie(self, tks):
298
+ res = []
299
+ for tk in tks.split(" "):
300
+ if len(tk) < 3 or re.match(r"[0-9,\.-]+$", tk):
301
+ res.append(tk)
302
+ continue
303
+ tkslist = []
304
+ if len(tk) > 10:
305
+ tkslist.append(tk)
306
+ else:
307
+ self.dfs_(tk, 0, [], tkslist)
308
+ if len(tkslist) < 2:
309
+ res.append(tk)
310
+ continue
311
+ stk = self.sortTks_(tkslist)[1][0]
312
+ if len(stk) == len(tk):
313
+ stk = tk
314
+ else:
315
+ if re.match(r"[a-z\.-]+$", tk):
316
+ for t in stk:
317
+ if len(t) < 3:
318
+ stk = tk
319
+ break
320
+ else:
321
+ stk = " ".join(stk)
322
+ else:
323
+ stk = " ".join(stk)
324
+
325
+ res.append(stk)
326
+
327
+ return " ".join(res)
328
+
329
+
330
+ def is_chinese(s):
331
+ if s >= u'\u4e00' and s <= u'\u9fa5':
332
+ return True
333
+ else:
334
+ return False
335
+
336
+
337
+ def is_number(s):
338
+ if s >= u'\u0030' and s <= u'\u0039':
339
+ return True
340
+ else:
341
+ return False
342
+
343
+
344
+ def is_alphabet(s):
345
+ if (s >= u'\u0041' and s <= u'\u005a') or (
346
+ s >= u'\u0061' and s <= u'\u007a'):
347
+ return True
348
+ else:
349
+ return False
350
+
351
+
352
+ def naiveQie(txt):
353
+ tks = []
354
+ for t in txt.split(" "):
355
+ if tks and re.match(r".*[a-zA-Z]$", tks[-1]
356
+ ) and re.match(r".*[a-zA-Z]$", t):
357
+ tks.append(" ")
358
+ tks.append(t)
359
+ return tks
360
+
361
+
362
+ hq = Huqie()
363
+ qie = hq.qie
364
+ qieqie = hq.qieqie
365
+ tag = hq.tag
366
+ freq = hq.freq
367
+ loadUserDict = hq.loadUserDict
368
+ addUserDict = hq.addUserDict
369
+ tradi2simp = hq._tradi2simp
370
+ strQ2B = hq._strQ2B
371
+
372
+ if __name__ == '__main__':
373
+ huqie = Huqie(debug=True)
374
+ # huqie.addUserDict("/tmp/tmp.new.tks.dict")
375
+ tks = huqie.qie(
376
+ "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
377
+ print(huqie.qieqie(tks))
378
+ tks = huqie.qie(
379
+ "公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。")
380
+ print(huqie.qieqie(tks))
381
+ tks = huqie.qie(
382
+ "多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥")
383
+ print(huqie.qieqie(tks))
384
+ tks = huqie.qie(
385
+ "实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa")
386
+ print(huqie.qieqie(tks))
387
+ tks = huqie.qie("虽然我不怎么玩")
388
+ print(huqie.qieqie(tks))
389
+ tks = huqie.qie("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
390
+ print(huqie.qieqie(tks))
391
+ tks = huqie.qie(
392
+ "涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了")
393
+ print(huqie.qieqie(tks))
394
+ tks = huqie.qie("这周日你去吗?这周日你有空吗?")
395
+ print(huqie.qieqie(tks))
396
+ tks = huqie.qie("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
397
+ print(huqie.qieqie(tks))
398
+ tks = huqie.qie(
399
+ "数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-")
400
+ print(huqie.qieqie(tks))
401
+ if len(sys.argv) < 2:
402
+ sys.exit()
403
+ huqie.DEBUG = False
404
+ huqie.loadUserDict(sys.argv[1])
405
+ of = open(sys.argv[2], "r")
406
+ while True:
407
+ line = of.readline()
408
+ if not line:
409
+ break
410
+ print(huqie.qie(line))
411
+ of.close()
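Huqie is a trie-backed Chinese tokenizer: loadDict_ builds a datrie from res/huqie.txt (term, frequency, POS tag), qie combines maximum forward and backward matching and resolves disagreements with a DFS over the trie, and qieqie re-splits long tokens into fine-grained pieces. A hedged usage sketch; it assumes datrie, hanziconv and the res/huqie.txt dictionary are available, which is also why "*.trie" cache files were added to .gitignore above:

from nlp import huqie   # module-level hq = Huqie() builds or loads the trie on import

coarse = huqie.qie("多校划片就是一个小区对应多个小学初中")   # coarse-grained tokens
fine = huqie.qieqie(coarse)                                   # fine-grained re-split
print(coarse)
print(fine)
print(huqie.freq("学区"), huqie.tag("学区"))                  # trie lookups: frequency and POS tag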
python/nlp/query.py ADDED
@@ -0,0 +1,167 @@
1
+ import json
2
+ import re
3
+ import sys
4
+ import os
5
+ import logging
6
+ import copy
7
+ import math
8
+ from elasticsearch_dsl import Q, Search
9
+ from nlp import huqie, term_weight, synonym
10
+
11
+
12
+ class EsQueryer:
13
+ def __init__(self, es):
14
+ self.tw = term_weight.Dealer()
15
+ self.es = es
16
+ self.syn = synonym.Dealer(None)
17
+ self.flds = ["ask_tks^10", "ask_small_tks"]
18
+
19
+ @staticmethod
20
+ def subSpecialChar(line):
21
+ return re.sub(r"([:\{\}/\[\]\-\*\"\(\)\|~\^])", r"\\\1", line).strip()
22
+
23
+ @staticmethod
24
+ def isChinese(line):
25
+ arr = re.split(r"[ \t]+", line)
26
+ if len(arr) <= 3:
27
+ return True
28
+ e = 0
29
+ for t in arr:
30
+ if not re.match(r"[a-zA-Z]+$", t):
31
+ e += 1
32
+ return e * 1. / len(arr) >= 0.8
33
+
34
+ @staticmethod
35
+ def rmWWW(txt):
36
+ txt = re.sub(
37
+ r"是*(什么样的|哪家|那家|啥样|咋样了|什么时候|何时|何地|何人|是否|是不是|多少|哪里|怎么|哪儿|怎么样|如何|哪些|是啥|啥是|啊|吗|呢|吧|咋|什么|有没有|呀)是*",
38
+ "",
39
+ txt)
40
+ return re.sub(
41
+ r"(what|who|how|which|where|why|(is|are|were|was) there) (is|are|were|was)*", "", txt, re.IGNORECASE)
42
+
43
+ def question(self, txt, tbl="qa", min_match="60%"):
44
+ txt = re.sub(
45
+ r"[ \t,,。??/`!!&]+",
46
+ " ",
47
+ huqie.tradi2simp(
48
+ huqie.strQ2B(
49
+ txt.lower()))).strip()
50
+ txt = EsQueryer.rmWWW(txt)
51
+
52
+ if not self.isChinese(txt):
53
+ tks = txt.split(" ")
54
+ q = []
55
+ for i in range(1, len(tks)):
56
+ q.append("\"%s %s\"~2" % (tks[i - 1], tks[i]))
57
+ if not q:
58
+ q.append(txt)
59
+ return Q("bool",
60
+ must=Q("query_string", fields=self.flds,
61
+ type="best_fields", query=" OR ".join(q),
62
+ boost=1, minimum_should_match="60%")
63
+ ), txt.split(" ")
64
+
65
+ def needQieqie(tk):
66
+ if len(tk) < 4:
67
+ return False
68
+ if re.match(r"[0-9a-z\.\+#_\*-]+$", tk):
69
+ return False
70
+ return True
71
+
72
+ qs, keywords = [], []
73
+ for tt in self.tw.split(txt): # .split(" "):
74
+ if not tt:
75
+ continue
76
+ twts = self.tw.weights([tt])
77
+ syns = self.syn.lookup(tt)
78
+ logging.info(json.dumps(twts, ensure_ascii=False))
79
+ tms = []
80
+ for tk, w in sorted(twts, key=lambda x: x[1] * -1):
81
+ sm = huqie.qieqie(tk).split(" ") if needQieqie(tk) else []
82
+ sm = [
83
+ re.sub(
84
+ r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+",
85
+ "",
86
+ m) for m in sm]
87
+ sm = [EsQueryer.subSpecialChar(m) for m in sm if len(m) > 1]
88
+ sm = [m for m in sm if len(m) > 1]
89
+ if len(sm) < 2:
90
+ sm = []
91
+
92
+ keywords.append(re.sub(r"[ \\\"']+", "", tk))
93
+
94
+ tk_syns = self.syn.lookup(tk)
95
+ tk = EsQueryer.subSpecialChar(tk)
96
+ if tk.find(" ") > 0:
97
+ tk = "\"%s\"" % tk
98
+ if tk_syns:
99
+ tk = f"({tk} %s)" % " ".join(tk_syns)
100
+ if sm:
101
+ tk = f"{tk} OR \"%s\" OR (\"%s\"~2)^0.5" % (
102
+ " ".join(sm), " ".join(sm))
103
+ tms.append((tk, w))
104
+
105
+ tms = " ".join([f"({t})^{w}" for t, w in tms])
106
+
107
+ if len(twts) > 1:
108
+ tms += f" (\"%s\"~4)^1.5" % (" ".join([t for t, _ in twts]))
109
+ if re.match(r"[0-9a-z ]+$", tt):
110
+ tms = f"(\"{tt}\" OR \"%s\")" % huqie.qie(tt)
111
+
112
+ syns = " OR ".join(
113
+ ["\"%s\"^0.7" % EsQueryer.subSpecialChar(huqie.qie(s)) for s in syns])
114
+ if syns:
115
+ tms = f"({tms})^5 OR ({syns})^0.7"
116
+
117
+ qs.append(tms)
118
+
119
+ flds = copy.deepcopy(self.flds)
120
+ mst = []
121
+ if qs:
122
+ mst.append(
123
+ Q("query_string", fields=flds, type="best_fields",
124
+ query=" OR ".join([f"({t})" for t in qs if t]), boost=1, minimum_should_match=min_match)
125
+ )
126
+
127
+ return Q("bool",
128
+ must=mst,
129
+ ), keywords
130
+
131
+ def hybrid_similarity(self, avec, bvecs, atks, btkss, tkweight=0.3,
132
+ vtweight=0.7):
133
+ from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
134
+ import numpy as np
135
+ sims = CosineSimilarity([avec], bvecs)
136
+
137
+ def toDict(tks):
138
+ d = {}
139
+ if isinstance(tks, type("")):
140
+ tks = tks.split(" ")
141
+ for t, c in self.tw.weights(tks):
142
+ if t not in d:
143
+ d[t] = 0
144
+ d[t] += c
145
+ return d
146
+
147
+ atks = toDict(atks)
148
+ btkss = [toDict(tks) for tks in btkss]
149
+ tksim = [self.similarity(atks, btks) for btks in btkss]
150
+ return np.array(sims[0]) * vtweight + np.array(tksim) * tkweight
151
+
152
+ def similarity(self, qtwt, dtwt):
153
+ if isinstance(dtwt, type("")):
154
+ dtwt = {t: w for t, w in self.tw.weights(self.tw.split(dtwt))}
155
+ if isinstance(qtwt, type("")):
156
+ qtwt = {t: w for t, w in self.tw.weights(self.tw.split(qtwt))}
157
+ s = 1e-9
158
+ for k, v in qtwt.items():
159
+ if k in dtwt:
160
+ s += v * dtwt[k]
161
+ q = 1e-9
162
+ for k, v in qtwt.items():
163
+ q += v * v
164
+ d = 1e-9
165
+ for k, v in dtwt.items():
166
+ d += v * v
167
+ return s / math.sqrt(q) / math.sqrt(d)
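EsQueryer.question turns a user question into a boosted query_string clause: terms are weighted by term_weight.Dealer, expanded with synonyms and fine-grained sub-tokens, and multi-term phrases get proximity boosts. The similarity helpers then blend token overlap with embedding distance: hybrid_similarity returns vtweight * cosine(vectors) + tkweight * token_similarity. The token side is just a weighted cosine over term-weight dictionaries; a small sketch with illustrative numbers:

import math

# similarity() in essence: s / (sqrt(q) * sqrt(d)) over shared terms
q = {"学区": 0.6, "房": 0.4}     # query term weights (as produced by Dealer.weights)
d = {"学区": 0.5, "降温": 0.5}   # document term weights

s = sum(w * d[t] for t, w in q.items() if t in d) + 1e-9
score = s / math.sqrt(sum(w * w for w in q.values()) + 1e-9) \
          / math.sqrt(sum(w * w for w in d.values()) + 1e-9)
print(round(score, 3))   # ≈ 0.588: only "学区" overlaps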
python/nlp/synonym.py ADDED
@@ -0,0 +1,67 @@
1
+ import json
2
+ import time
3
+ import logging
4
+ import re
5
+
6
+
7
+ class Dealer:
8
+ def __init__(self, redis=None):
9
+
10
+ self.lookup_num = 100000000
11
+ self.load_tm = time.time() - 1000000
12
+ self.dictionary = None
13
+ try:
14
+ self.dictionary = json.load(open("./synonym.json", 'r'))
15
+ except Exception as e:
16
+ pass
17
+ try:
18
+ self.dictionary = json.load(open("./res/synonym.json", 'r'))
19
+ except Exception as e:
20
+ try:
21
+ self.dictionary = json.load(open("../res/synonym.json", 'r'))
22
+ except Exception as e:
23
+ logging.warn("Miss synonym.json")
24
+ self.dictionary = {}
25
+
26
+ if not redis:
27
+ logging.warning(
28
+ "Realtime synonym is disabled, since no redis connection.")
29
+ if not len(self.dictionary.keys()):
30
+ logging.warning(f"Fail to load synonym")
31
+
32
+ self.redis = redis
33
+ self.load()
34
+
35
+ def load(self):
36
+ if not self.redis:
37
+ return
38
+
39
+ if self.lookup_num < 100:
40
+ return
41
+ tm = time.time()
42
+ if tm - self.load_tm < 3600:
43
+ return
44
+
45
+ self.load_tm = time.time()
46
+ self.lookup_num = 0
47
+ d = self.redis.get("kevin_synonyms")
48
+ if not d:
49
+ return
50
+ try:
51
+ d = json.loads(d)
52
+ self.dictionary = d
53
+ except Exception as e:
54
+ logging.error("Fail to load synonym!" + str(e))
55
+
56
+ def lookup(self, tk):
57
+ self.lookup_num += 1
58
+ self.load()
59
+ res = self.dictionary.get(re.sub(r"[ \t]+", " ", tk.lower()), [])
60
+ if isinstance(res, str):
61
+ res = [res]
62
+ return res
63
+
64
+
65
+ if __name__ == '__main__':
66
+ dl = Dealer()
67
+ print(dl.dictionary)
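Dealer.lookup is a plain dictionary lookup over res/synonym.json, optionally refreshed from the "kevin_synonyms" Redis key at most once an hour when a Redis connection is supplied. A minimal sketch; the queried term and its expansion are made up, and the real dictionary is expected under res/:

from nlp import synonym

syn = synonym.Dealer()      # no redis passed: realtime refresh stays disabled
print(syn.lookup("北大"))    # e.g. ["北京大学"] if the dictionary has that entry, otherwise []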
python/nlp/term_weight.py ADDED
@@ -0,0 +1,216 @@
1
+ import math
2
+ import json
3
+ import re
4
+ import os
5
+ import numpy as np
6
+ from nlp import huqie
7
+
8
+
9
+ class Dealer:
10
+ def __init__(self):
11
+ self.stop_words = set(["请问",
12
+ "您",
13
+ "你",
14
+ "我",
15
+ "他",
16
+ "是",
17
+ "的",
18
+ "就",
19
+ "有",
20
+ "于",
21
+ "及",
22
+ "即",
23
+ "在",
24
+ "为",
25
+ "最",
26
+ "有",
27
+ "从",
28
+ "以",
29
+ "了",
30
+ "将",
31
+ "与",
32
+ "吗",
33
+ "吧",
34
+ "中",
35
+ "#",
36
+ "什么",
37
+ "怎么",
38
+ "哪个",
39
+ "哪些",
40
+ "啥",
41
+ "相关"])
42
+
43
+ def load_dict(fnm):
44
+ res = {}
45
+ f = open(fnm, "r")
46
+ while True:
47
+ l = f.readline()
48
+ if not l:
49
+ break
50
+ arr = l.replace("\n", "").split("\t")
51
+ if len(arr) < 2:
52
+ res[arr[0]] = 0
53
+ else:
54
+ res[arr[0]] = int(arr[1])
55
+
56
+ c = 0
57
+ for _, v in res.items():
58
+ c += v
59
+ if c == 0:
60
+ return set(res.keys())
61
+ return res
62
+
63
+ fnm = os.path.join(os.path.dirname(__file__), '../res/')
64
+ if not os.path.exists(fnm):
65
+ fnm = os.path.join(os.path.dirname(__file__), '../../res/')
66
+ self.ne, self.df = {}, {}
67
+ try:
68
+ self.ne = json.load(open(fnm + "ner.json", "r"))
69
+ except Exception as e:
70
+ print("[WARNING] Load ner.json FAIL!")
71
+ try:
72
+ self.df = load_dict(fnm + "term.freq")
73
+ except Exception as e:
74
+ print("[WARNING] Load term.freq FAIL!")
75
+
76
+ def pretoken(self, txt, num=False, stpwd=True):
77
+ patt = [
78
+ r"[~—\t @#%!<>,\.\?\":;'\{\}\[\]_=\(\)\|,。?》•●○↓《;‘’:“”【¥ 】…¥!、·()×`&\\/「」\\]"
79
+ ]
80
+ rewt = [
81
+ ]
82
+ for p, r in rewt:
83
+ txt = re.sub(p, r, txt)
84
+
85
+ res = []
86
+ for t in huqie.qie(txt).split(" "):
87
+ tk = t
88
+ if (stpwd and tk in self.stop_words) or (
89
+ re.match(r"[0-9]$", tk) and not num):
90
+ continue
91
+ for p in patt:
92
+ if re.match(p, t):
93
+ tk = "#"
94
+ break
95
+ tk = re.sub(r"([\+\\-])", r"\\\1", tk)
96
+ if tk != "#" and tk:
97
+ res.append(tk)
98
+ return res
99
+
100
+ def tokenMerge(self, tks):
101
+ def oneTerm(t): return len(t) == 1 or re.match(r"[0-9a-z]{1,2}$", t)
102
+
103
+ res, i = [], 0
104
+ while i < len(tks):
105
+ j = i
106
+ if i == 0 and oneTerm(tks[i]) and len(
107
+ tks) > 1 and len(tks[i + 1]) > 1:  # e.g. "多 工位": merge a leading single-char token with the next token
108
+ res.append(" ".join(tks[0:2]))
109
+ i = 2
110
+ continue
111
+
112
+ while j < len(
113
+ tks) and tks[j] and tks[j] not in self.stop_words and oneTerm(tks[j]):
114
+ j += 1
115
+ if j - i > 1:
116
+ if j - i < 5:
117
+ res.append(" ".join(tks[i:j]))
118
+ i = j
119
+ else:
120
+ res.append(" ".join(tks[i:i + 2]))
121
+ i = i + 2
122
+ else:
123
+ if len(tks[i]) > 0:
124
+ res.append(tks[i])
125
+ i += 1
126
+ return [t for t in res if t]
127
+
128
+ def ner(self, t):
129
+ if not self.ne:
130
+ return ""
131
+ res = self.ne.get(t, "")
132
+ if res:
133
+ return res
134
+
135
+ def split(self, txt):
136
+ tks = []
137
+ for t in re.sub(r"[ \t]+", " ", txt).split(" "):
138
+ if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
139
+ re.match(r".*[a-zA-Z]$", t) and tks and \
140
+ self.ne.get(t, "") != "func" and self.ne.get(tks[-1], "") != "func":
141
+ tks[-1] = tks[-1] + " " + t
142
+ else:
143
+ tks.append(t)
144
+ return tks
145
+
146
+ def weights(self, tks):
147
+ def skill(t):
148
+ if t not in self.sk:
149
+ return 1
150
+ return 6
151
+
152
+ def ner(t):
153
+ if not self.ne or t not in self.ne:
154
+ return 1
155
+ m = {"toxic": 2, "func": 1, "corp": 3, "loca": 3, "sch": 3, "stock": 3,
156
+ "firstnm": 1}
157
+ return m[self.ne[t]]
158
+
159
+ def postag(t):
160
+ t = huqie.tag(t)
161
+ if t in set(["r", "c", "d"]):
162
+ return 0.3
163
+ if t in set(["ns", "nt"]):
164
+ return 3
165
+ if t in set(["n"]):
166
+ return 2
167
+ if re.match(r"[0-9-]+", t):
168
+ return 2
169
+ return 1
170
+
171
+ def freq(t):
172
+ if re.match(r"[0-9\. -]+$", t):
173
+ return 10000
174
+ s = huqie.freq(t)
175
+ if not s and re.match(r"[a-z\. -]+$", t):
176
+ return 10
177
+ if not s:
178
+ s = 0
179
+
180
+ if not s and len(t) >= 4:
181
+ s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
182
+ if len(s) > 1:
183
+ s = np.min([freq(tt) for tt in s]) / 6.
184
+ else:
185
+ s = 0
186
+
187
+ return max(s, 10)
188
+
189
+ def df(t):
190
+ if re.match(r"[0-9\. -]+$", t):
191
+ return 100000
192
+ if t in self.df:
193
+ return self.df[t] + 3
194
+ elif re.match(r"[a-z\. -]+$", t):
195
+ return 3
196
+ elif len(t) >= 4:
197
+ s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
198
+ if len(s) > 1:
199
+ return max(3, np.min([df(tt) for tt in s]) / 6.)
200
+
201
+ return 3
202
+
203
+ def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))
204
+
205
+ tw = []
206
+ for tk in tks:
207
+ tt = self.tokenMerge(self.pretoken(tk, True))
208
+ idf1 = np.array([idf(freq(t), 10000000) for t in tt])
209
+ idf2 = np.array([idf(df(t), 1000000000) for t in tt])
210
+ wts = (0.3 * idf1 + 0.7 * idf2) * \
211
+ np.array([ner(t) * postag(t) for t in tt])
212
+
213
+ tw.extend(zip(tt, wts))
214
+
215
+ S = np.sum([s for _, s in tw])
216
+ return [(t, s / S) for t, s in tw]
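Dealer.weights scores each token with a blend of two IDF-like terms, one driven by the tokenizer's trie frequency and one by the term.freq document-frequency table, scales the blend by NER type and POS tag, and finally normalizes so the weights sum to 1. The inner helper is idf(s, N) = log10(10 + (N - s + 0.5) / (s + 0.5)); a small worked sketch of that formula with illustrative counts:

import math

def idf(s, N):
    # same shape as the helper inside Dealer.weights
    return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))

print(round(idf(10, 10_000_000), 2))      # rare term   -> ≈ 5.98
print(round(idf(10_000, 10_000_000), 2))  # common term -> ≈ 3.0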
python/parser/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .pdf_parser import HuParser as PdfParser
+ from .docx_parser import HuDocxParser as DocxParser
+ from .excel_parser import HuExcelParser as ExcelParser
python/parser/docx_parser.py ADDED
@@ -0,0 +1,103 @@
1
+ from docx import Document
2
+ import re
3
+ import pandas as pd
4
+ from collections import Counter
5
+ from nlp import huqie
6
+
7
+
8
+ class HuDocxParser:
9
+
10
+ def __extract_table_content(self, tb):
11
+ df = []
12
+ for row in tb.rows:
13
+ df.append([c.text for c in row.cells])
14
+ return self.__compose_table_content(pd.DataFrame(df))
15
+
16
+ def __compose_table_content(self, df):
17
+
18
+ def blockType(b):
19
+ patt = [
20
+ ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
21
+ (r"^(20|19)[0-9]{2}年$", "Dt"),
22
+ (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
23
+ ("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
24
+ (r"^第*[一二三四1-4]季度$", "Dt"),
25
+ (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
26
+ (r"^(20|19)[0-9]{2}[ABCDE]$", "DT"),
27
+ ("^[0-9.,+%/ -]+$", "Nu"),
28
+ (r"^[0-9A-Z/\._~-]+$", "Ca"),
29
+ (r"^[A-Z]*[a-z' -]+$", "En"),
30
+ (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
31
+ (r"^.{1}$", "Sg")
32
+ ]
33
+ for p, n in patt:
34
+ if re.search(p, b):
35
+ return n
36
+ tks = [t for t in huqie.qie(b).split(" ") if len(t) > 1]
37
+ if len(tks) > 3:
38
+ if len(tks) < 12:
39
+ return "Tx"
40
+ else:
41
+ return "Lx"
42
+
43
+ if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
44
+ return "Nr"
45
+
46
+ return "Ot"
47
+
48
+ if len(df) < 2:
49
+ return []
50
+ max_type = Counter([blockType(str(df.iloc[i, j])) for i in range(
51
+ 1, len(df)) for j in range(len(df.iloc[i, :]))])
52
+ max_type = max(max_type.items(), key=lambda x: x[1])[0]
53
+
54
+ colnm = len(df.iloc[0, :])
55
+ hdrows = [0]  # the header does not necessarily appear in the first row
56
+ if max_type == "Nu":
57
+ for r in range(1, len(df)):
58
+ tys = Counter([blockType(str(df.iloc[r, j]))
59
+ for j in range(len(df.iloc[r, :]))])
60
+ tys = max(tys.items(), key=lambda x: x[1])[0]
61
+ if tys != max_type:
62
+ hdrows.append(r)
63
+
64
+ lines = []
65
+ for i in range(1, len(df)):
66
+ if i in hdrows:
67
+ continue
68
+ hr = [r - i for r in hdrows]
69
+ hr = [r for r in hr if r < 0]
70
+ t = len(hr) - 1
71
+ while t > 0:
72
+ if hr[t] - hr[t - 1] > 1:
73
+ hr = hr[t:]
74
+ break
75
+ t -= 1
76
+ headers = []
77
+ for j in range(len(df.iloc[i, :])):
78
+ t = []
79
+ for h in hr:
80
+ x = str(df.iloc[i + h, j]).strip()
81
+ if x in t:
82
+ continue
83
+ t.append(x)
84
+ t = ",".join(t)
85
+ if t:
86
+ t += ": "
87
+ headers.append(t)
88
+ cells = []
89
+ for j in range(len(df.iloc[i, :])):
90
+ if not str(df.iloc[i, j]):
91
+ continue
92
+ cells.append(headers[j] + str(df.iloc[i, j]))
93
+ lines.append(";".join(cells))
94
+
95
+ if colnm > 3:
96
+ return lines
97
+ return ["\n".join(lines)]
98
+
99
+ def __call__(self, fnm):
100
+ self.doc = Document(fnm)
101
+ secs = [(p.text, p.style.name) for p in self.doc.paragraphs]
102
+ tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
103
+ return secs, tbls
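HuDocxParser returns the document as (paragraph text, style name) pairs plus flattened tables: __compose_table_content classifies every cell (date, number, text, ...), locates the header rows, and rewrites each data row as "header: value" pairs joined with ";". A minimal usage sketch; the .docx path is an assumption and python/ must be on sys.path since the parser imports nlp.huqie:

from parser import DocxParser   # HuDocxParser re-exported by parser/__init__.py

psr = DocxParser()
secs, tbls = psr("sample.docx")   # secs: [(paragraph_text, style_name)], tbls: flattened table lines
print(secs[:3])
print(tbls[0] if tbls else "no tables")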
python/parser/excel_parser.py ADDED
@@ -0,0 +1,21 @@
+ from openpyxl import load_workbook
+ import sys
+
+
+ class HuExcelParser:
+     def __call__(self, fnm):
+         wb = load_workbook(fnm)
+         res = []
+         for sheetname in wb.sheetnames:
+             ws = wb[sheetname]
+             lines = []
+             for r in ws.rows:
+                 lines.append(
+                     "\t".join([str(c.value) if c.value is not None else "" for c in r]))
+             res.append(f"《{sheetname}》\n" + "\n".join(lines))
+         return res
+
+
+ if __name__ == "__main__":
+     psr = HuExcelParser()
+     psr(sys.argv[1])
python/parser/pdf_parser.py ADDED
@@ -0,0 +1,1635 @@
1
+ import xgboost as xgb
2
+ import torch
3
+ import re
4
+ import pdfplumber
5
+ import logging
6
+ from PIL import Image
7
+ import numpy as np
8
+ from nlp import huqie
9
+ from collections import Counter
10
+ from copy import deepcopy
11
+ from cv.table_recognize import TableTransformer
12
+ from cv.ppdetection import PPDet
13
+ from huggingface_hub import hf_hub_download
14
+ logging.getLogger("pdfminer").setLevel(logging.WARNING)
15
+
16
+
17
+ class HuParser:
18
+ def __init__(self):
19
+ from paddleocr import PaddleOCR
20
+ logging.getLogger("ppocr").setLevel(logging.ERROR)
21
+ self.ocr = PaddleOCR(use_angle_cls=False, lang="ch")
22
+ self.layouter = PPDet()
23
+ self.tbl_det = TableTransformer()
24
+
25
+ self.updown_cnt_mdl = xgb.Booster()
26
+ if torch.cuda.is_available():
27
+ self.updown_cnt_mdl.set_param({"device": "cuda"})
28
+ self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
29
+ filename="updown_concat_xgb.model"))
30
+ """
31
+ If you have trouble downloading HuggingFace models, -_^ this might help!!
32
+
33
+ For Linux:
34
+ export HF_ENDPOINT=https://hf-mirror.com
35
+
36
+ For Windows:
37
+ Good luck
38
+ ^_-
39
+
40
+ """
41
+
42
+ def __char_width(self, c):
43
+ return (c["x1"] - c["x0"]) // len(c["text"])
44
+
45
+ def __height(self, c):
46
+ return c["bottom"] - c["top"]
47
+
48
+ def _x_dis(self, a, b):
49
+ return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
50
+ abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)
51
+
52
+ def _y_dis(
53
+ self, a, b):
54
+ return (
55
+ b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
56
+
57
+ def _match_proj(self, b):
58
+ proj_patt = [
59
+ r"第[零一二三四五六七八九十百]+章",
60
+ r"第[零一二三四五六七八九十百]+[条节]",
61
+ r"[零一二三四五六七八九十百]+[、是  ]",
62
+ r"[\((][零一二三四五六七八九十百]+[)\)]",
63
+ r"[\((][0-9]+[)\)]",
64
+ r"[0-9]+(、|\.[  ]|)|\.[^0-9./a-zA-Z_%><-]{4,})",
65
+ r"[0-9]+\.[0-9.]+(、|\.[  ])",
66
+ r"[⚫•➢①② ]",
67
+ ]
68
+ return any([re.match(p, b["text"]) for p in proj_patt])
69
+
70
+ def _updown_concat_features(self, up, down):
71
+ w = max(self.__char_width(up), self.__char_width(down))
72
+ h = max(self.__height(up), self.__height(down))
73
+ y_dis = self._y_dis(up, down)
74
+ LEN = 6
75
+ tks_down = huqie.qie(down["text"][:LEN]).split(" ")
76
+ tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
77
+ tks_all = up["text"][-LEN:].strip() \
78
+ + (" " if re.match(r"[a-zA-Z0-9]+",
79
+ up["text"][-1] + down["text"][0]) else "") \
80
+ + down["text"][:LEN].strip()
81
+ tks_all = huqie.qie(tks_all).split(" ")
82
+ fea = [
83
+ up.get("R", -1) == down.get("R", -1),
84
+ y_dis / h,
85
+ down["page_number"] - up["page_number"],
86
+ up["layout_type"] == down["layout_type"],
87
+ up["layout_type"] == "text",
88
+ down["layout_type"] == "text",
89
+ up["layout_type"] == "table",
90
+ down["layout_type"] == "table",
91
+ True if re.search(
92
+ r"([。?!;!?;+))]|[a-z]\.)$",
93
+ up["text"]) else False,
94
+ True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
95
+ True if re.search(
96
+ r"(^.?[/,?;:\],。;:’”?!》】)-])",
97
+ down["text"]) else False,
98
+ True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False,
99
+ True if re.search(r"[,,][^。.]+$", up["text"]) else False,
100
+ True if re.search(r"[,,][^。.]+$", up["text"]) else False,
101
+ True if re.search(r"[\((][^\))]+$", up["text"])
102
+ and re.search(r"[\))]", down["text"]) else False,
103
+ self._match_proj(down),
104
+ True if re.match(r"[A-Z]", down["text"]) else False,
105
+ True if re.match(r"[A-Z]", up["text"][-1]) else False,
106
+ True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
107
+ True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
108
+ up["text"].strip()[-2:] == down["text"].strip()[-2:] if len(up["text"].strip()
109
+ ) > 1 and len(
110
+ down["text"].strip()) > 1 else False,
111
+ up["x0"] > down["x1"],
112
+ abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
113
+ self.__height(down)),
114
+ self._x_dis(up, down) / max(w, 0.000001),
115
+ (len(up["text"]) - len(down["text"])) /
116
+ max(len(up["text"]), len(down["text"])),
117
+ len(tks_all) - len(tks_up) - len(tks_down),
118
+ len(tks_down) - len(tks_up),
119
+ tks_down[-1] == tks_up[-1],
120
+ max(down["in_row"], up["in_row"]),
121
+ abs(down["in_row"] - up["in_row"]),
122
+ len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
123
+ len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
124
+ ]
125
+ return fea
126
+
127
+ @staticmethod
128
+ def sort_Y_firstly(arr, threashold):
129
+ # sort using y1 first and then x1
130
+ arr = sorted(arr, key=lambda r: (r["top"], r["x0"]))
131
+ for i in range(len(arr) - 1):
132
+ for j in range(i, -1, -1):
133
+ # restore the order using th
134
+ if abs(arr[j + 1]["top"] - arr[j]["top"]) < threashold \
135
+ and arr[j + 1]["x0"] < arr[j]["x0"]:
136
+ tmp = deepcopy(arr[j])
137
+ arr[j] = deepcopy(arr[j + 1])
138
+ arr[j + 1] = deepcopy(tmp)
139
+ return arr
140
+
141
+ @staticmethod
142
+ def sort_R_firstly(arr, thr=0):
143
+ # sort using y1 first and then x1
144
+ # sorted(arr, key=lambda r: (r["top"], r["x0"]))
145
+ arr = HuParser.sort_Y_firstly(arr, thr)
146
+ for i in range(len(arr) - 1):
147
+ for j in range(i, -1, -1):
148
+ if "R" not in arr[j] or "R" not in arr[j + 1]:
149
+ continue
150
+ if arr[j + 1]["R"] < arr[j]["R"] \
151
+ or (
152
+ arr[j + 1]["R"] == arr[j]["R"]
153
+ and arr[j + 1]["x0"] < arr[j]["x0"]
154
+ ):
155
+ tmp = arr[j]
156
+ arr[j] = arr[j + 1]
157
+ arr[j + 1] = tmp
158
+ return arr
159
+
160
+ @staticmethod
161
+ def sort_X_firstly(arr, threashold, copy=True):
162
+ # sort using y1 first and then x1
163
+ arr = sorted(arr, key=lambda r: (r["x0"], r["top"]))
164
+ for i in range(len(arr) - 1):
165
+ for j in range(i, -1, -1):
166
+ # restore the order using th
167
+ if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
168
+ and arr[j + 1]["top"] < arr[j]["top"]:
169
+ tmp = deepcopy(arr[j]) if copy else arr[j]
170
+ arr[j] = deepcopy(arr[j + 1]) if copy else arr[j + 1]
171
+ arr[j + 1] = deepcopy(tmp) if copy else tmp
172
+ return arr
173
+
174
+ @staticmethod
175
+ def sort_C_firstly(arr, thr=0):
176
+ # sort using y1 first and then x1
177
+ # sorted(arr, key=lambda r: (r["x0"], r["top"]))
178
+ arr = HuParser.sort_X_firstly(arr, thr)
179
+ for i in range(len(arr) - 1):
180
+ for j in range(i, -1, -1):
181
+ # restore the order using th
182
+ if "C" not in arr[j] or "C" not in arr[j + 1]:
183
+ continue
184
+ if arr[j + 1]["C"] < arr[j]["C"] \
185
+ or (
186
+ arr[j + 1]["C"] == arr[j]["C"]
187
+ and arr[j + 1]["top"] < arr[j]["top"]
188
+ ):
189
+ tmp = arr[j]
190
+ arr[j] = arr[j + 1]
191
+ arr[j + 1] = tmp
192
+ return arr
193
+
194
+ return sorted(arr, key=lambda r: (r.get("C", r["x0"]), r["top"]))
195
+
196
+ def _has_color(self, o):
197
+ if o.get("ncs", "") == "DeviceGray":
198
+ if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
199
+ o["non_stroking_color"][0] == 1:
200
+ if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
201
+ return False
202
+ return True
203
+
204
+ def __overlapped_area(self, a, b, ratio=True):
205
+ tp, btm, x0, x1 = a["top"], a["bottom"], a["x0"], a["x1"]
206
+ if b["x0"] > x1 or b["x1"] < x0:
207
+ return 0
208
+ if b["bottom"] < tp or b["top"] > btm:
209
+ return 0
210
+ x0_ = max(b["x0"], x0)
211
+ x1_ = min(b["x1"], x1)
212
+ assert x0_ <= x1_, "Fuckedup! T:{},B:{},X0:{},X1:{} ==> {}".format(
213
+ tp, btm, x0, x1, b)
214
+ tp_ = max(b["top"], tp)
215
+ btm_ = min(b["bottom"], btm)
216
+ assert tp_ <= btm_, "Fuckedup! T:{},B:{},X0:{},X1:{} => {}".format(
217
+ tp, btm, x0, x1, b)
218
+ ov = (btm_ - tp_) * (x1_ - x0_) if x1 - \
219
+ x0 != 0 and btm - tp != 0 else 0
220
+ if ov > 0 and ratio:
221
+ ov /= (x1 - x0) * (btm - tp)
222
+ return ov
223
+
224
+ def __find_overlapped_with_threashold(self, box, boxes, thr=0.3):
225
+ if not boxes:
226
+ return
227
+ max_overlaped_i, max_overlaped, _max_overlaped = None, thr, 0
228
+ s, e = 0, len(boxes)
229
+ for i in range(s, e):
230
+ ov = self.__overlapped_area(box, boxes[i])
231
+ _ov = self.__overlapped_area(boxes[i], box)
232
+ if (ov, _ov) < (max_overlaped, _max_overlaped):
233
+ continue
234
+ max_overlaped_i = i
235
+ max_overlaped = ov
236
+ _max_overlaped = _ov
237
+
238
+ return max_overlaped_i
239
+
240
+ def __find_overlapped(self, box, boxes_sorted_by_y, naive=False):
241
+ if not boxes_sorted_by_y:
242
+ return
243
+ bxs = boxes_sorted_by_y
244
+ s, e, ii = 0, len(bxs), 0
245
+ while s < e and not naive:
246
+ ii = (e + s) // 2
247
+ pv = bxs[ii]
248
+ if box["bottom"] < pv["top"]:
249
+ e = ii
250
+ continue
251
+ if box["top"] > pv["bottom"]:
252
+ s = ii + 1
253
+ continue
254
+ break
255
+ while s < ii:
256
+ if box["top"] > bxs[s]["bottom"]:
257
+ s += 1
258
+ break
259
+ while e - 1 > ii:
260
+ if box["bottom"] < bxs[e - 1]["top"]:
261
+ e -= 1
262
+ break
263
+
264
+ max_overlaped_i, max_overlaped = None, 0
265
+ for i in range(s, e):
266
+ ov = self.__overlapped_area(bxs[i], box)
267
+ if ov <= max_overlaped:
268
+ continue
269
+ max_overlaped_i = i
270
+ max_overlaped = ov
271
+
272
+ return max_overlaped_i
273
+
274
+ def _is_garbage(self, b):
275
+ patt = [r"^•+$", r"(版权归©|免责条款|地址[::])", r"\.{3,}", "^[0-9]{1,2} / ?[0-9]{1,2}$",
276
+ r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}",
277
+ "(资料|数据)来源[::]", "[0-9a-z._-]+@[a-z0-9-]+\\.[a-z]{2,3}",
278
+ "\\(cid *: *[0-9]+ *\\)"
279
+ ]
280
+ return any([re.search(p, b["text"]) for p in patt])
281
+
282
+ def __layouts_cleanup(self, boxes, layouts, far=2, thr=0.7):
283
+ def notOverlapped(a, b):
284
+ return any([a["x1"] < b["x0"],
285
+ a["x0"] > b["x1"],
286
+ a["bottom"] < b["top"],
287
+ a["top"] > b["bottom"]])
288
+
289
+ i = 0
290
+ while i + 1 < len(layouts):
291
+ j = i + 1
292
+ while j < min(i + far, len(layouts)) \
293
+ and (layouts[i].get("type", "") != layouts[j].get("type", "")
294
+ or notOverlapped(layouts[i], layouts[j])):
295
+ j += 1
296
+ if j >= min(i + far, len(layouts)):
297
+ i += 1
298
+ continue
299
+ if self.__overlapped_area(layouts[i], layouts[j]) < thr \
300
+ and self.__overlapped_area(layouts[j], layouts[i]) < thr:
301
+ i += 1
302
+ continue
303
+
304
+ if layouts[i].get("score") and layouts[j].get("score"):
305
+ if layouts[i]["score"] > layouts[j]["score"]:
306
+ layouts.pop(j)
307
+ else:
308
+ layouts.pop(i)
309
+ continue
310
+
311
+ area_i, area_i_1 = 0, 0
312
+ for b in boxes:
313
+ if not notOverlapped(b, layouts[i]):
314
+ area_i += self.__overlapped_area(b, layouts[i], False)
315
+ if not notOverlapped(b, layouts[j]):
316
+ area_i_1 += self.__overlapped_area(b, layouts[j], False)
317
+
318
+ if area_i > area_i_1:
319
+ layouts.pop(j)
320
+ else:
321
+ layouts.pop(i)
322
+
323
+ return layouts
324
+
325
+ def __table_paddle(self, images):
326
+ tbls = self.tbl_det([np.array(img) for img in images], thr=0.5)
327
+ res = []
328
+ # align left&right for rows, align top&bottom for columns
329
+ for tbl in tbls:
330
+ lts = [{"label": b["type"],
331
+ "score": b["score"],
332
+ "x0": b["bbox"][0], "x1": b["bbox"][2],
333
+ "top": b["bbox"][1], "bottom": b["bbox"][-1]
334
+ } for b in tbl]
335
+ if not lts:
336
+ continue
337
+
338
+ left = [b["x0"] for b in lts if b["label"].find(
339
+ "row") > 0 or b["label"].find("header") > 0]
340
+ right = [b["x1"] for b in lts if b["label"].find(
341
+ "row") > 0 or b["label"].find("header") > 0]
342
+ if not left:
343
+ continue
344
+ left = np.median(left) if len(left) > 4 else np.min(left)
345
+ right = np.median(right) if len(right) > 4 else np.max(right)
346
+ for b in lts:
347
+ if b["label"].find("row") > 0 or b["label"].find("header") > 0:
348
+ if b["x0"] > left:
349
+ b["x0"] = left
350
+ if b["x1"] < right:
351
+ b["x1"] = right
352
+
353
+ top = [b["top"] for b in lts if b["label"] == "table column"]
354
+ bottom = [b["bottom"] for b in lts if b["label"] == "table column"]
355
+ if not top:
356
+ res.append(lts)
357
+ continue
358
+ top = np.median(top) if len(top) > 4 else np.min(top)
359
+ bottom = np.median(bottom) if len(bottom) > 4 else np.max(bottom)
360
+ for b in lts:
361
+ if b["label"] == "table column":
362
+ if b["top"] > top:
363
+ b["top"] = top
364
+ if b["bottom"] < bottom:
365
+ b["bottom"] = bottom
366
+
367
+ res.append(lts)
368
+ return res
369
+
370
+ def __table_transformer_job(self, ZM):
371
+ logging.info("Table processing...")
372
+ imgs, pos = [], []
373
+ tbcnt = [0]
374
+ MARGIN = 10
375
+ self.tb_cpns = []
376
+ assert len(self.page_layout) == len(self.page_images)
377
+ for p, tbls in enumerate(self.page_layout): # for page
378
+ tbls = [f for f in tbls if f["type"] == "table"]
379
+ tbcnt.append(len(tbls))
380
+ if not tbls:
381
+ continue
382
+ for tb in tbls: # for table
383
+ left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
384
+ tb["x1"] + MARGIN, tb["bottom"] + MARGIN
385
+ left *= ZM
386
+ top *= ZM
387
+ right *= ZM
388
+ bott *= ZM
389
+ pos.append((left, top))
390
+ imgs.append(self.page_images[p].crop((left, top, right, bott)))
391
+
392
+ assert len(self.page_images) == len(tbcnt) - 1
393
+ if not imgs:
394
+ return
395
+ recos = self.__table_paddle(imgs)
396
+ tbcnt = np.cumsum(tbcnt)
397
+ for i in range(len(tbcnt) - 1): # for page
398
+ pg = []
399
+ for j, tb_items in enumerate(
400
+ recos[tbcnt[i]: tbcnt[i + 1]]): # for table
401
+ poss = pos[tbcnt[i]: tbcnt[i + 1]]
402
+ for it in tb_items: # for table components
403
+ it["x0"] = (it["x0"] + poss[j][0])
404
+ it["x1"] = (it["x1"] + poss[j][0])
405
+ it["top"] = (it["top"] + poss[j][1])
406
+ it["bottom"] = (it["bottom"] + poss[j][1])
407
+ for n in ["x0", "x1", "top", "bottom"]:
408
+ it[n] /= ZM
409
+ it["top"] += self.page_cum_height[i]
410
+ it["bottom"] += self.page_cum_height[i]
411
+ it["pn"] = i
412
+ it["layoutno"] = j
413
+ pg.append(it)
414
+ self.tb_cpns.extend(pg)
415
+
416
+ def __ocr_paddle(self, pagenum, img, chars, ZM=3):
417
+ bxs = self.ocr.ocr(np.array(img), cls=True)[0]
418
+ if not bxs:
419
+ self.boxes.append([])
420
+ return
421
+ bxs = [(line[0], line[1][0]) for line in bxs]
422
+ bxs = self.sort_Y_firstly(
423
+ [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
424
+ "top": b[0][1] / ZM, "text": "", "txt": t,
425
+ "bottom": b[-1][1] / ZM,
426
+ "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
427
+ self.mean_height[-1] / 3
428
+ )
429
+
430
+ # merge chars in the same rect
431
+ for c in self.sort_X_firstly(chars, self.mean_width[pagenum - 1] // 4):
432
+ ii = self.__find_overlapped(c, bxs)
433
+ if ii is None:
434
+ self.lefted_chars.append(c)
435
+ continue
436
+ ch = c["bottom"] - c["top"]
437
+ bh = bxs[ii]["bottom"] - bxs[ii]["top"]
438
+ if abs(ch - bh) / max(ch, bh) >= 0.7:
439
+ self.lefted_chars.append(c)
440
+ continue
441
+ bxs[ii]["text"] += c["text"]
442
+
443
+ for b in bxs:
444
+ if not b["text"]:
445
+ b["text"] = b["txt"]
446
+ del b["txt"]
447
+ if self.mean_height[-1] == 0:
448
+ self.mean_height[-1] = np.median([b["bottom"] - b["top"]
449
+ for b in bxs])
450
+
451
+ self.boxes.append(bxs)
452
+
453
+ def __layouts_paddle(self, ZM):
454
+ assert len(self.page_images) == len(self.boxes)
455
+ # Tag layout type
456
+ boxes = []
457
+ layouts = self.layouter([np.array(img) for img in self.page_images])
458
+ assert len(self.page_images) == len(layouts)
459
+ for pn, lts in enumerate(layouts):
460
+ bxs = self.boxes[pn]
461
+ lts = [{"type": b["type"],
462
+ "score": float(b["score"]),
463
+ "x0": b["bbox"][0] / ZM, "x1": b["bbox"][2] / ZM,
464
+ "top": b["bbox"][1] / ZM, "bottom": b["bbox"][-1] / ZM,
465
+ "page_number": pn,
466
+ } for b in lts]
467
+ lts = self.sort_Y_firstly(lts, self.mean_height[pn] / 2)
468
+ lts = self.__layouts_cleanup(bxs, lts)
469
+ self.page_layout.append(lts)
470
+
471
+ # Tag layout type, layouts are ready
472
+ def findLayout(ty):
473
+ nonlocal bxs, lts
474
+ lts_ = [lt for lt in lts if lt["type"] == ty]
475
+ i = 0
476
+ while i < len(bxs):
477
+ if bxs[i].get("layout_type"):
478
+ i += 1
479
+ continue
480
+ if self._is_garbage(bxs[i]):
481
+ logging.debug("GARBAGE: " + bxs[i]["text"])
482
+ bxs.pop(i)
483
+ continue
484
+
485
+ ii = self.__find_overlapped_with_threashold(bxs[i], lts_,
486
+ thr=0.4)
487
+ if ii is None: # belong to nothing
488
+ bxs[i]["layout_type"] = ""
489
+ i += 1
490
+ continue
491
+ lts_[ii]["visited"] = True
492
+ if lts_[ii]["type"] in ["footer", "header", "reference"]:
493
+ if lts_[ii]["type"] not in self.garbages:
494
+ self.garbages[lts_[ii]["type"]] = []
495
+ self.garbages[lts_[ii]["type"]].append(bxs[i]["text"])
496
+ logging.debug("GARBAGE: " + bxs[i]["text"])
497
+ bxs.pop(i)
498
+ continue
499
+
500
+ bxs[i]["layoutno"] = f"{ty}-{ii}"
501
+ bxs[i]["layout_type"] = lts_[ii]["type"]
502
+ i += 1
503
+
504
+ for lt in ["footer", "header", "reference", "figure caption",
505
+ "table caption", "title", "text", "table", "figure"]:
506
+ findLayout(lt)
507
+
508
+ # add a box for figure layouts that contain no text box
509
+ for i, lt in enumerate(
510
+ [lt for lt in lts if lt["type"] == "figure"]):
511
+ if lt.get("visited"):
512
+ continue
513
+ lt = deepcopy(lt)
514
+ del lt["type"]
515
+ lt["text"] = ""
516
+ lt["layout_type"] = "figure"
517
+ lt["layoutno"] = f"figure-{i}"
518
+ bxs.append(lt)
519
+
520
+ boxes.extend(bxs)
521
+
522
+ self.boxes = boxes
523
+
524
+ def __text_merge(self, garbage):
525
+ # merge adjacent boxes
526
+ bxs = self.boxes
527
+
528
+ def end_with(b, txt):
529
+ txt = txt.strip()
530
+ tt = b.get("text", "").strip()
531
+ return tt and tt.find(txt) == len(tt) - len(txt)
532
+
533
+ def start_with(b, txts):
534
+ tt = b.get("text", "").strip()
535
+ return tt and any([tt.find(t.strip()) == 0 for t in txts])
536
+
537
+ i = 0
538
+ while i < len(bxs) - 1:
539
+ b = bxs[i]
540
+ b_ = bxs[i + 1]
541
+ if b.get("layoutno", "0") != b_.get("layoutno", "1"):
542
+ i += 1
543
+ continue
544
+
545
+ dis_thr = 1
546
+ dis = b["x1"] - b_["x0"]
547
+ if b.get("layout_type", "") != "text" or b_.get(
548
+ "layout_type", "") != "text":
549
+ if end_with(b, ",") or start_with(b_, "(,"):
550
+ dis_thr = -8
551
+ else:
552
+ i += 1
553
+ continue
554
+
555
+ if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
556
+ and dis >= dis_thr and b["x1"] < b_["x1"]:
557
+ # merge
558
+ bxs[i]["x1"] = b_["x1"]
559
+ bxs[i]["top"] = (b["top"] + b_["top"]) / 2
560
+ bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
561
+ bxs[i]["text"] += b_["text"]
562
+ bxs.pop(i + 1)
563
+ continue
564
+ i += 1
565
+ self.boxes = bxs
566
+
567
+ # count boxes in the same row
568
+ for i in range(len(self.boxes)):
569
+ mh = self.mean_height[self.boxes[i]["page_number"] - 1]
570
+ self.boxes[i]["in_row"] = 0
571
+ j = max(0, i - 12)
572
+ while j < min(i + 12, len(self.boxes)):
573
+ if j == i:
574
+ j += 1
575
+ continue
576
+ ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
577
+ if abs(ydis) < 1:
578
+ self.boxes[i]["in_row"] += 1
579
+ elif ydis > 0:
580
+ break
581
+ j += 1
582
+
583
+ def gather(kwd, fzy=10, ption=0.6):
584
+ eles = self.sort_Y_firstly(
585
+ [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
586
+ eles = self.__layouts_cleanup(self.boxes, eles, 5, ption)
587
+ return self.sort_Y_firstly(eles, 0)
588
+
589
+ headers = gather(r".*header$")
590
+ rows = gather(r".* (row|header)")
591
+ spans = gather(r".*spanning")
592
+ clmns = sorted([r for r in self.tb_cpns if re.match(
593
+ r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
594
+ clmns = self.__layouts_cleanup(self.boxes, clmns, 5, 0.5)
595
+ for b in self.boxes:
596
+ if b.get("layout_type", "") != "table":
597
+ continue
598
+ ii = self.__find_overlapped_with_threashold(b, rows, thr=0.3)
599
+ if ii is not None:
600
+ b["R"] = ii
601
+ b["R_top"] = rows[ii]["top"]
602
+ b["R_bott"] = rows[ii]["bottom"]
603
+
604
+ ii = self.__find_overlapped_with_threashold(b, headers, thr=0.3)
605
+ if ii is not None:
606
+ b["H_top"] = headers[ii]["top"]
607
+ b["H_bott"] = headers[ii]["bottom"]
608
+ b["H_left"] = headers[ii]["x0"]
609
+ b["H_right"] = headers[ii]["x1"]
610
+ b["H"] = ii
611
+
612
+ ii = self.__find_overlapped_with_threashold(b, clmns, thr=0.3)
613
+ if ii is not None:
614
+ b["C"] = ii
615
+ b["C_left"] = clmns[ii]["x0"]
616
+ b["C_right"] = clmns[ii]["x1"]
617
+
618
+ ii = self.__find_overlapped_with_threashold(b, spans, thr=0.3)
619
+ if ii is not None:
620
+ b["H_top"] = spans[ii]["top"]
621
+ b["H_bott"] = spans[ii]["bottom"]
622
+ b["H_left"] = spans[ii]["x0"]
623
+ b["H_right"] = spans[ii]["x1"]
624
+ b["SP"] = ii
625
+
626
+ # concat between rows
627
+ boxes = deepcopy(self.boxes)
628
+ blocks = []
629
+ while boxes:
630
+ chunks = []
631
+
632
+ def dfs(up, dp):
633
+ if not up["text"].strip() or up["text"].strip() in garbage:
634
+ return
635
+ chunks.append(up)
636
+ i = dp
637
+ while i < min(dp + 12, len(boxes)):
638
+ ydis = self._y_dis(up, boxes[i])
639
+ smpg = up["page_number"] == boxes[i]["page_number"]
640
+ mh = self.mean_height[up["page_number"] - 1]
641
+ mw = self.mean_width[up["page_number"] - 1]
642
+ if smpg and ydis > mh * 4:
643
+ break
644
+ if not smpg and ydis > mh * 16:
645
+ break
646
+ down = boxes[i]
647
+
648
+ if up.get("R", "") != down.get(
649
+ "R", "") and up["text"][-1] != ",":
650
+ i += 1
651
+ continue
652
+
653
+ if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
654
+ or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]):
655
+ i += 1
656
+ continue
657
+
658
+ if not down["text"].strip() \
659
+ or down["text"].strip() in garbage:
660
+ i += 1
661
+ continue
662
+
663
+ if up["x1"] < down["x0"] - 10 * \
664
+ mw or up["x0"] > down["x1"] + 10 * mw:
665
+ i += 1
666
+ continue
667
+
668
+ if i - dp < 5 and up.get("layout_type") == "text":
669
+ if up.get("layoutno", "1") == down.get(
670
+ "layoutno", "2"):
671
+ dfs(down, i + 1)
672
+ boxes.pop(i)
673
+ return
674
+ i += 1
675
+ continue
676
+
677
+ fea = self._updown_concat_features(up, down)
678
+ if self.updown_cnt_mdl.predict(
679
+ xgb.DMatrix([fea]))[0] <= 0.5:
680
+ i += 1
681
+ continue
682
+ dfs(down, i + 1)
683
+ boxes.pop(i)
684
+ return
685
+
686
+ dfs(boxes[0], 1)
687
+ boxes.pop(0)
688
+ if chunks:
689
+ blocks.append(chunks)
690
+
691
+ # concat within each block
692
+ boxes = []
693
+ for b in blocks:
694
+ if len(b) == 1:
695
+ boxes.append(b[0])
696
+ continue
697
+ t = b[0]
698
+ for c in b[1:]:
699
+ t["text"] = t["text"].strip()
700
+ c["text"] = c["text"].strip()
701
+ if not c["text"]:
702
+ continue
703
+ if t["text"] and re.match(
704
+ r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
705
+ t["text"] += " "
706
+ t["text"] += c["text"]
707
+ t["x0"] = min(t["x0"], c["x0"])
708
+ t["x1"] = max(t["x1"], c["x1"])
709
+ t["page_number"] = min(t["page_number"], c["page_number"])
710
+ t["bottom"] = c["bottom"]
711
+ if not t["layout_type"] \
712
+ and c["layout_type"]:
713
+ t["layout_type"] = c["layout_type"]
714
+ boxes.append(t)
715
+
716
+ self.boxes = self.sort_Y_firstly(boxes, 0)
717
+
718
+ def __filter_forpages(self):
719
+ if not self.boxes:
720
+ return
721
+ to = min(7, len(self.page_images) // 5)
722
+ pg_hits = [0 for _ in range(to)]
723
+
724
+ def possible(c):
725
+ if c.get("layout_type", "") == "reference":
726
+ return True
727
+ if c["bottom"] - c["top"] >= 2 * \
728
+ self.mean_height[c["page_number"] - 1]:
729
+ return False
730
+ if c["text"].find("....") >= 0 \
731
+ or (c["x1"] - c["x0"] > 250 and re.search(r"[0-9]+$",
732
+ c["text"].strip())):
733
+ return True
734
+ return self.is_caption(c) and re.search(
735
+ r"[0-9]+$", c["text"].strip())
736
+
737
+ for c in self.boxes:
738
+ if c["page_number"] >= to:
739
+ break
740
+ if possible(c):
741
+ pg_hits[c["page_number"] - 1] += 1
742
+
743
+ st, ed = -1, -1
744
+ for i in range(len(self.boxes)):
745
+ c = self.boxes[i]
746
+ if c["page_number"] >= to:
747
+ break
748
+ if pg_hits[c["page_number"] - 1] >= 3 and possible(c):
749
+ if st < 0:
750
+ st = i
751
+ else:
752
+ ed = i
753
+ for _ in range(st, ed + 1):
754
+ self.boxes.pop(st)
755
+
756
+ def _blockType(self, b):
757
+ patt = [
758
+ ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
759
+ (r"^(20|19)[0-9]{2}年$", "Dt"),
760
+ (r"^(20|19)[0-9]{2}[年-][0-9]{1,2}月*$", "Dt"),
761
+ ("^[0-9]{1,2}[月-][0-9]{1,2}日*$", "Dt"),
762
+ (r"^第*[一二三四1-4]季度$", "Dt"),
763
+ (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
764
+ (r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
765
+ ("^[0-9.,+%/ -]+$", "Nu"),
766
+ (r"^[0-9A-Z/\._~-]+$", "Ca"),
767
+ (r"^[A-Z]*[a-z' -]+$", "En"),
768
+ (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
769
+ (r"^.{1}$", "Sg")
770
+ ]
771
+ for p, n in patt:
772
+ if re.search(p, b["text"].strip()):
773
+ return n
774
+ tks = [t for t in huqie.qie(b["text"]).split(" ") if len(t) > 1]
775
+ if len(tks) > 3:
776
+ if len(tks) < 12:
777
+ return "Tx"
778
+ else:
779
+ return "Lx"
780
+
781
+ if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
782
+ return "Nr"
783
+
784
+ return "Ot"
785
+
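_blockType feeds the header detection in __construct_table further down (a row dominated by non-numeric cells in a mostly numeric table is treated as a header), so the labels are worth seeing on concrete strings. A standalone sketch restricted to the regex-based labels above; the tokenizer-based cases (Tx/Lx/Nr) are omitted and the sample strings are invented:

import re

# subset of the patterns in _blockType above; order matters, first match wins
patterns = [
    (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),  # date
    (r"^[0-9.,+%/ -]+$", "Nu"),                                        # number
    (r"^[0-9A-Z/\._~-]+$", "Ca"),                                      # code / identifier
    (r"^[A-Z]*[a-z' -]+$", "En"),                                      # English words
    (r"^.{1}$", "Sg"),                                                 # single character
]

def block_type(text):
    for p, label in patterns:
        if re.search(p, text.strip()):
            return label
    return "Ot"

for s in ["2023-08-01", "1,234.56", "ISO-9001", "net profit", "?"]:
    print(s, "->", block_type(s))
# 2023-08-01 -> Dt, 1,234.56 -> Nu, ISO-9001 -> Ca, net profit -> En, ? -> Sg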
786
+ def __cal_spans(self, boxes, rows, cols, tbl, html=True):
787
+ # calculate spans
788
+ clft = [np.mean([c.get("C_left", c["x0"]) for c in cln])
789
+ for cln in cols]
790
+ crgt = [np.mean([c.get("C_right", c["x1"]) for c in cln])
791
+ for cln in cols]
792
+ rtop = [np.mean([c.get("R_top", c["top"]) for c in row])
793
+ for row in rows]
794
+ rbtm = [np.mean([c.get("R_btm", c["bottom"])
795
+ for c in row]) for row in rows]
796
+ for b in boxes:
797
+ if "SP" not in b:
798
+ continue
799
+ b["colspan"] = [b["cn"]]
800
+ b["rowspan"] = [b["rn"]]
801
+ # col span
802
+ for j in range(0, len(clft)):
803
+ if j == b["cn"]:
804
+ continue
805
+ if clft[j] + (crgt[j] - clft[j]) / 2 < b["H_left"]:
806
+ continue
807
+ if crgt[j] - (crgt[j] - clft[j]) / 2 > b["H_right"]:
808
+ continue
809
+ b["colspan"].append(j)
810
+ # row span
811
+ for j in range(0, len(rtop)):
812
+ if j == b["rn"]:
813
+ continue
814
+ if rtop[j] + (rbtm[j] - rtop[j]) / 2 < b["H_top"]:
815
+ continue
816
+ if rbtm[j] - (rbtm[j] - rtop[j]) / 2 > b["H_bott"]:
817
+ continue
818
+ b["rowspan"].append(j)
819
+
820
+ def join(arr):
821
+ if not arr:
822
+ return ""
823
+ return "".join([t["text"] for t in arr])
824
+
825
+ # remove the spanning cells
826
+ for i in range(len(tbl)):
827
+ for j, arr in enumerate(tbl[i]):
828
+ if not arr:
829
+ continue
830
+ if all(["rowspan" not in a and "colspan" not in a for a in arr]):
831
+ continue
832
+ rowspan, colspan = [], []
833
+ for a in arr:
834
+ if isinstance(a.get("rowspan", 0), list):
835
+ rowspan.extend(a["rowspan"])
836
+ if isinstance(a.get("colspan", 0), list):
837
+ colspan.extend(a["colspan"])
838
+ rowspan, colspan = set(rowspan), set(colspan)
839
+ if len(rowspan) < 2 and len(colspan) < 2:
840
+ for a in arr:
841
+ if "rowspan" in a:
842
+ del a["rowspan"]
843
+ if "colspan" in a:
844
+ del a["colspan"]
845
+ continue
846
+ rowspan, colspan = sorted(rowspan), sorted(colspan)
847
+ rowspan = list(range(rowspan[0], rowspan[-1] + 1))
848
+ colspan = list(range(colspan[0], colspan[-1] + 1))
849
+ assert i in rowspan, rowspan
850
+ assert j in colspan, colspan
851
+ arr = []
852
+ for r in rowspan:
853
+ for c in colspan:
854
+ arr_txt = join(arr)
855
+ if tbl[r][c] and join(tbl[r][c]) != arr_txt:
856
+ arr.extend(tbl[r][c])
857
+ tbl[r][c] = None if html else arr
858
+ for a in arr:
859
+ if len(rowspan) > 1:
860
+ a["rowspan"] = len(rowspan)
861
+ elif "rowspan" in a:
862
+ del a["rowspan"]
863
+ if len(colspan) > 1:
864
+ a["colspan"] = len(colspan)
865
+ elif "colspan" in a:
866
+ del a["colspan"]
867
+ tbl[rowspan[0]][colspan[0]] = arr
868
+
869
+ return tbl
870
+
871
+ def __construct_table(self, boxes, html=False):
872
+ cap = ""
873
+ i = 0
874
+ while i < len(boxes):
875
+ if self.is_caption(boxes[i]):
876
+ cap += boxes[i]["text"]
877
+ boxes.pop(i)
878
+ i -= 1
879
+ i += 1
880
+
881
+ if not boxes:
882
+ return []
883
+ for b in boxes:
884
+ b["btype"] = self._blockType(b)
885
+ max_type = Counter([b["btype"] for b in boxes]).items()
886
+ max_type = max(max_type, key=lambda x: x[1])[0] if max_type else ""
887
+ logging.debug("MAXTYPE: " + max_type)
888
+
889
+ rowh = [b["R_bott"] - b["R_top"] for b in boxes if "R" in b]
890
+ rowh = np.min(rowh) if rowh else 0
891
+ # boxes = self.sort_Y_firstly(boxes, rowh/5)
892
+ boxes = self.sort_R_firstly(boxes, rowh / 2)
893
+ boxes[0]["rn"] = 0
894
+ rows = [[boxes[0]]]
895
+ btm = boxes[0]["bottom"]
896
+ for b in boxes[1:]:
897
+ b["rn"] = len(rows) - 1
898
+ lst_r = rows[-1]
899
+ if lst_r[-1].get("R", "") != b.get("R", "") \
900
+ or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")
901
+ ): # new row
902
+ btm = b["bottom"]
903
+ b["rn"] += 1
904
+ rows.append([b])
905
+ continue
906
+ btm = (btm + b["bottom"]) / 2.
907
+ rows[-1].append(b)
908
+
909
+ colwm = [b["C_right"] - b["C_left"] for b in boxes if "C" in b]
910
+ colwm = np.min(colwm) if colwm else 0
911
+ crosspage = len(set([b["page_number"] for b in boxes])) > 1
912
+ if crosspage:
913
+ boxes = self.sort_X_firstly(boxes, colwm / 2, False)
914
+ else:
915
+ boxes = self.sort_C_firstly(boxes, colwm / 2)
916
+ boxes[0]["cn"] = 0
917
+ cols = [[boxes[0]]]
918
+ right = boxes[0]["x1"]
919
+ for b in boxes[1:]:
920
+ b["cn"] = len(cols) - 1
921
+ lst_c = cols[-1]
922
+ if (int(b.get("C", "1")) - int(lst_c[-1].get("C", "1")) == 1 and b["page_number"] == lst_c[-1][
923
+ "page_number"]) \
924
+ or (b["x0"] >= right and lst_c[-1].get("C", "-1") != b.get("C", "-2")): # new col
925
+ right = b["x1"]
926
+ b["cn"] += 1
927
+ cols.append([b])
928
+ continue
929
+ right = (right + b["x1"]) / 2.
930
+ cols[-1].append(b)
931
+
932
+ tbl = [[[] for _ in range(len(cols))] for _ in range(len(rows))]
933
+ for b in boxes:
934
+ tbl[b["rn"]][b["cn"]].append(b)
935
+
936
+ if len(rows) >= 4:
937
+ # relocate columns that contain only a single filled cell
938
+ j = 0
939
+ while j < len(tbl[0]):
940
+ e, ii = 0, 0
941
+ for i in range(len(tbl)):
942
+ if tbl[i][j]:
943
+ e += 1
944
+ ii = i
945
+ if e > 1:
946
+ break
947
+ if e > 1:
948
+ j += 1
949
+ continue
950
+ f = (j > 0 and tbl[ii][j - 1] and tbl[ii]
951
+ [j - 1][0].get("text")) or j == 0
952
+ ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii]
953
+ [j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
954
+ if f and ff:
955
+ j += 1
956
+ continue
957
+ bx = tbl[ii][j][0]
958
+ logging.debug("Relocate column single: " + bx["text"])
959
+ # column j only has one value
960
+ left, right = 100000, 100000
961
+ if j > 0 and not f:
962
+ for i in range(len(tbl)):
963
+ if tbl[i][j - 1]:
964
+ left = min(left, np.min(
965
+ [bx["x0"] - a["x1"] for a in tbl[i][j - 1]]))
966
+ if j + 1 < len(tbl[0]) and not ff:
967
+ for i in range(len(tbl)):
968
+ if tbl[i][j + 1]:
969
+ right = min(right, np.min(
970
+ [a["x0"] - bx["x1"] for a in tbl[i][j + 1]]))
971
+ assert left < 100000 or right < 100000
972
+ if left < right:
973
+ for jj in range(j, len(tbl[0])):
974
+ for i in range(len(tbl)):
975
+ for a in tbl[i][jj]:
976
+ a["cn"] -= 1
977
+ if tbl[ii][j - 1]:
978
+ tbl[ii][j - 1].extend(tbl[ii][j])
979
+ else:
980
+ tbl[ii][j - 1] = tbl[ii][j]
981
+ for i in range(len(tbl)):
982
+ tbl[i].pop(j)
983
+
984
+ else:
985
+ for jj in range(j + 1, len(tbl[0])):
986
+ for i in range(len(tbl)):
987
+ for a in tbl[i][jj]:
988
+ a["cn"] -= 1
989
+ if tbl[ii][j + 1]:
990
+ tbl[ii][j + 1].extend(tbl[ii][j])
991
+ else:
992
+ tbl[ii][j + 1] = tbl[ii][j]
993
+ for i in range(len(tbl)):
994
+ tbl[i].pop(j)
995
+ cols.pop(j)
996
+ assert len(cols) == len(tbl[0]), "Column number mismatched: %d vs %d" % (
997
+ len(cols), len(tbl[0]))
998
+
999
+ if len(cols) >= 4:
1000
+ # relocate rows that contain only a single filled cell
1001
+ i = 0
1002
+ while i < len(tbl):
1003
+ e, jj = 0, 0
1004
+ for j in range(len(tbl[i])):
1005
+ if tbl[i][j]:
1006
+ e += 1
1007
+ jj = j
1008
+ if e > 1:
1009
+ break
1010
+ if e > 1:
1011
+ i += 1
1012
+ continue
1013
+ f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1]
1014
+ [jj][0].get("text")) or i == 0
1015
+ ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1]
1016
+ [jj][0].get("text")) or i + 1 >= len(tbl)
1017
+ if f and ff:
1018
+ i += 1
1019
+ continue
1020
+
1021
+ bx = tbl[i][jj][0]
1022
+ logging.debug("Relocate row single: " + bx["text"])
1023
+ # row i only has one value
1024
+ up, down = 100000, 100000
1025
+ if i > 0 and not f:
1026
+ for j in range(len(tbl[i - 1])):
1027
+ if tbl[i - 1][j]:
1028
+ up = min(up, np.min(
1029
+ [bx["top"] - a["bottom"] for a in tbl[i - 1][j]]))
1030
+ if i + 1 < len(tbl) and not ff:
1031
+ for j in range(len(tbl[i + 1])):
1032
+ if tbl[i + 1][j]:
1033
+ down = min(down, np.min(
1034
+ [a["top"] - bx["bottom"] for a in tbl[i + 1][j]]))
1035
+ assert up < 100000 or down < 100000
1036
+ if up < down:
1037
+ for ii in range(i, len(tbl)):
1038
+ for j in range(len(tbl[ii])):
1039
+ for a in tbl[ii][j]:
1040
+ a["rn"] -= 1
1041
+ if tbl[i - 1][jj]:
1042
+ tbl[i - 1][jj].extend(tbl[i][jj])
1043
+ else:
1044
+ tbl[i - 1][jj] = tbl[i][jj]
1045
+ tbl.pop(i)
1046
+
1047
+ else:
1048
+ for ii in range(i + 1, len(tbl)):
1049
+ for j in range(len(tbl[ii])):
1050
+ for a in tbl[ii][j]:
1051
+ a["rn"] -= 1
1052
+ if tbl[i + 1][jj]:
1053
+ tbl[i + 1][jj].extend(tbl[i][jj])
1054
+ else:
1055
+ tbl[i + 1][jj] = tbl[i][jj]
1056
+ tbl.pop(i)
1057
+ rows.pop(i)
1058
+
1059
+ # which rows are headers
1060
+ hdset = set([])
1061
+ for i in range(len(tbl)):
1062
+ cnt, h = 0, 0
1063
+ for j, arr in enumerate(tbl[i]):
1064
+ if not arr:
1065
+ continue
1066
+ cnt += 1
1067
+ if max_type == "Nu" and arr[0]["btype"] == "Nu":
1068
+ continue
1069
+ if any([a.get("H") for a in arr]) \
1070
+ or (max_type == "Nu" and arr[0]["btype"] != "Nu"):
1071
+ h += 1
1072
+ if h / cnt > 0.5:
1073
+ hdset.add(i)
1074
+
1075
+ if html:
1076
+ return [self.__html_table(cap, hdset,
1077
+ self.__cal_spans(boxes, rows,
1078
+ cols, tbl, True)
1079
+ )]
1080
+
1081
+ return self.__desc_table(cap, hdset,
1082
+ self.__cal_spans(boxes, rows, cols, tbl, False))
1083
+
1084
+ def __html_table(self, cap, hdset, tbl):
1085
+ # construct HTML
1086
+ html = "<table>"
1087
+ if cap:
1088
+ html += f"<caption>{cap}</caption>"
1089
+ for i in range(len(tbl)):
1090
+ row = "<tr>"
1091
+ txts = []
1092
+ for j, arr in enumerate(tbl[i]):
1093
+ if arr is None:
1094
+ continue
1095
+ if not arr:
1096
+ row += "<td></td>" if i not in hdset else "<th></th>"
1097
+ continue
1098
+ txt = ""
1099
+ if arr:
1100
+ h = min(np.min([c["bottom"] - c["top"] for c in arr]) / 2,
1101
+ self.mean_height[arr[0]["page_number"] - 1] / 2)
1102
+ txt = "".join([c["text"]
1103
+ for c in self.sort_Y_firstly(arr, h)])
1104
+ txts.append(txt)
1105
+ sp = ""
1106
+ if arr[0].get("colspan"):
1107
+ sp = "colspan={}".format(arr[0]["colspan"])
1108
+ if arr[0].get("rowspan"):
1109
+ sp += " rowspan={}".format(arr[0]["rowspan"])
1110
+ if i in hdset:
1111
+ row += f"<th {sp} >" + txt + "</th>"
1112
+ else:
1113
+ row += f"<td {sp} >" + txt + "</td>"
1114
+
1115
+ if i in hdset:
1116
+ if all([t in hdset for t in txts]):
1117
+ continue
1118
+ for t in txts:
1119
+ hdset.add(t)
1120
+
1121
+ if row != "<tr>":
1122
+ row += "</tr>"
1123
+ else:
1124
+ row = ""
1125
+ html += "\n" + row
1126
+ html += "\n</table>"
1127
+ return html
1128
+
1129
+ def __desc_table(self, cap, hdr_rowno, tbl):
1130
+ # get the text of every column in the header rows to build the header text
1131
+ clmno = len(tbl[0])
1132
+ rowno = len(tbl)
1133
+ headers = {}
1134
+ hdrset = set()
1135
+ lst_hdr = []
1136
+ for r in sorted(list(hdr_rowno)):
1137
+ headers[r] = ["" for _ in range(clmno)]
1138
+ for i in range(clmno):
1139
+ if not tbl[r][i]:
1140
+ continue
1141
+ txt = "".join([a["text"].strip() for a in tbl[r][i]])
1142
+ headers[r][i] = txt
1143
+ hdrset.add(txt)
1144
+ if all([not t for t in headers[r]]):
1145
+ del headers[r]
1146
+ hdr_rowno.remove(r)
1147
+ continue
1148
+ for j in range(clmno):
1149
+ if headers[r][j]:
1150
+ continue
1151
+ if j >= len(lst_hdr):
1152
+ break
1153
+ headers[r][j] = lst_hdr[j]
1154
+ lst_hdr = headers[r]
1155
+ for i in range(rowno):
1156
+ if i not in hdr_rowno:
1157
+ continue
1158
+ for j in range(i + 1, rowno):
1159
+ if j not in hdr_rowno:
1160
+ break
1161
+ for k in range(clmno):
1162
+ if not headers[j - 1][k]:
1163
+ continue
1164
+ if headers[j][k].find(headers[j - 1][k]) >= 0:
1165
+ continue
1166
+ if len(headers[j][k]) > len(headers[j - 1][k]):
1167
+ headers[j][k] += ("的" if headers[j][k]
1168
+ else "") + headers[j - 1][k]
1169
+ else:
1170
+ headers[j][k] = headers[j - 1][k] \
1171
+ + ("的" if headers[j - 1][k] else "") \
1172
+ + headers[j][k]
1173
+
1174
+ logging.debug(
1175
+ f">>>>>>>>>>>>>>>>>{cap}:SIZE:{rowno}X{clmno} Header: {hdr_rowno}")
1176
+ row_txt = []
1177
+ for i in range(rowno):
1178
+ if i in hdr_rowno:
1179
+ continue
1180
+ rtxt = []
1181
+
1182
+ def append(delimer):
1183
+ nonlocal rtxt, row_txt
1184
+ rtxt = delimer.join(rtxt)
1185
+ if row_txt and len(row_txt[-1]) + len(rtxt) < 64:
1186
+ row_txt[-1] += "\n" + rtxt
1187
+ else:
1188
+ row_txt.append(rtxt)
1189
+
1190
+ r = 0
1191
+ if len(headers.items()):
1192
+ _arr = [(i - r, r) for r, _ in headers.items() if r < i]
1193
+ if _arr:
1194
+ _, r = min(_arr, key=lambda x: x[0])
1195
+
1196
+ if r not in headers and clmno <= 2:
1197
+ for j in range(clmno):
1198
+ if not tbl[i][j]:
1199
+ continue
1200
+ txt = "".join([a["text"].strip() for a in tbl[i][j]])
1201
+ if txt:
1202
+ rtxt.append(txt)
1203
+ if rtxt:
1204
+ append(":")
1205
+ continue
1206
+
1207
+ for j in range(clmno):
1208
+ if not tbl[i][j]:
1209
+ continue
1210
+ txt = "".join([a["text"].strip() for a in tbl[i][j]])
1211
+ if not txt:
1212
+ continue
1213
+ ctt = headers[r][j] if r in headers else ""
1214
+ if ctt:
1215
+ ctt += ":"
1216
+ ctt += txt
1217
+ if ctt:
1218
+ rtxt.append(ctt)
1219
+
1220
+ if rtxt:
1221
+ row_txt.append("; ".join(rtxt))
1222
+
1223
+ if cap:
1224
+ row_txt = [t + f"\t——来自“{cap}”" for t in row_txt]
1225
+ return row_txt
1226
+
1227
+ @staticmethod
1228
+ def is_caption(bx):
1229
+ patt = [
1230
+ r"[图表]+[ 0-9::]{2,}"
1231
+ ]
1232
+ if any([re.match(p, bx["text"].strip()) for p in patt]) \
1233
+ or bx["layout_type"].find("caption") >= 0:
1234
+ return True
1235
+ return False
1236
+
1237
+ def __extract_table_figure(self, need_image, ZM, return_html):
1238
+ tables = {}
1239
+ figures = {}
1240
+ # extract figure and table boxes
1241
+ i = 0
1242
+ lst_lout_no = ""
1243
+ nomerge_lout_no = []
1244
+ while i < len(self.boxes):
1245
+ if "layoutno" not in self.boxes[i]:
1246
+ i += 1
1247
+ continue
1248
+ lout_no = str(self.boxes[i]["page_number"]) + \
1249
+ "-" + str(self.boxes[i]["layoutno"])
1250
+ if self.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title",
1251
+ "figure caption", "reference"]:
1252
+ nomerge_lout_no.append(lst_lout_no)
1253
+ if self.boxes[i]["layout_type"] == "table":
1254
+ if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
1255
+ self.boxes.pop(i)
1256
+ continue
1257
+ if lout_no not in tables:
1258
+ tables[lout_no] = []
1259
+ tables[lout_no].append(self.boxes[i])
1260
+ self.boxes.pop(i)
1261
+ lst_lout_no = lout_no
1262
+ continue
1263
+ if need_image and self.boxes[i]["layout_type"] == "figure":
1264
+ if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
1265
+ self.boxes.pop(i)
1266
+ continue
1267
+ if lout_no not in figures:
1268
+ figures[lout_no] = []
1269
+ figures[lout_no].append(self.boxes[i])
1270
+ self.boxes.pop(i)
1271
+ lst_lout_no = lout_no
1272
+ continue
1273
+ i += 1
1274
+
1275
+ # merge tables that continue across pages
1276
+ nomerge_lout_no = set(nomerge_lout_no)
1277
+ tbls = sorted([(k, bxs) for k, bxs in tables.items()],
1278
+ key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))
1279
+
1280
+ i = len(tbls) - 1
1281
+ while i - 1 >= 0:
1282
+ k0, bxs0 = tbls[i - 1]
1283
+ k, bxs = tbls[i]
1284
+ i -= 1
1285
+ if k0 in nomerge_lout_no:
1286
+ continue
1287
+ if bxs[0]["page_number"] == bxs0[0]["page_number"]:
1288
+ continue
1289
+ if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
1290
+ continue
1291
+ mh = self.mean_height[bxs[0]["page_number"] - 1]
1292
+ if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
1293
+ continue
1294
+ tables[k0].extend(tables[k])
1295
+ del tables[k]
1296
+
1297
+ def x_overlapped(a, b):
1298
+ return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])
1299
+
1300
+ # find captions and pop out
1301
+ i = 0
1302
+ while i < len(self.boxes):
1303
+ c = self.boxes[i]
1304
+ # mh = self.mean_height[c["page_number"]-1]
1305
+ if not self.is_caption(c):
1306
+ i += 1
1307
+ continue
1308
+
1309
+ # find the nearest layouts
1310
+ def nearest(tbls):
1311
+ nonlocal c
1312
+ mink = ""
1313
+ minv = 1000000000
1314
+ for k, bxs in tbls.items():
1315
+ for b in bxs[:10]:
1316
+ if b.get("layout_type", "").find("caption") >= 0:
1317
+ continue
1318
+ y_dis = self._y_dis(c, b)
1319
+ x_dis = self._x_dis(
1320
+ c, b) if not x_overlapped(
1321
+ c, b) else 0
1322
+ dis = y_dis * y_dis + x_dis * x_dis
1323
+ if dis < minv:
1324
+ mink = k
1325
+ minv = dis
1326
+ return mink, minv
1327
+
1328
+ tk, tv = nearest(tables)
1329
+ fk, fv = nearest(figures)
1330
+ if min(tv, fv) > 2000:
1331
+ i += 1
1332
+ continue
1333
+ if tv < fv:
1334
+ tables[tk].insert(0, c)
1335
+ logging.debug(
1336
+ "TABLE:" +
1337
+ self.boxes[i]["text"] +
1338
+ "; Cap: " +
1339
+ tk)
1340
+ else:
1341
+ figures[fk].insert(0, c)
1342
+ logging.debug(
1343
+ "FIGURE:" +
1344
+ self.boxes[i]["text"] +
1345
+ "; Cap: " +
1346
+ tk)
1347
+ self.boxes.pop(i)
1348
+
1349
+ res = []
1350
+
1351
+ def cropout(bxs, ltype):
1352
+ nonlocal ZM
1353
+ pn = set([b["page_number"] - 1 for b in bxs])
1354
+ if len(pn) < 2:
1355
+ pn = list(pn)[0]
1356
+ ht = self.page_cum_height[pn]
1357
+ b = {
1358
+ "x0": np.min([b["x0"] for b in bxs]),
1359
+ "top": np.min([b["top"] for b in bxs]) - ht,
1360
+ "x1": np.max([b["x1"] for b in bxs]),
1361
+ "bottom": np.max([b["bottom"] for b in bxs]) - ht
1362
+ }
1363
+ louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
1364
+ ii = self.__find_overlapped(b, louts, naive=True)
1365
+ if ii is not None:
1366
+ b = louts[ii]
1367
+ else:
1368
+ logging.warn(
1369
+ f"Missing layout match: {pn + 1},%s" %
1370
+ (bxs[0].get(
1371
+ "layoutno", "")))
1372
+
1373
+ left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
1374
+ return self.page_images[pn] \
1375
+ .crop((left * ZM, top * ZM,
1376
+ right * ZM, bott * ZM))
1377
+ pn = {}
1378
+ for b in bxs:
1379
+ p = b["page_number"] - 1
1380
+ if p not in pn:
1381
+ pn[p] = []
1382
+ pn[p].append(b)
1383
+ pn = sorted(pn.items(), key=lambda x: x[0])
1384
+ imgs = [cropout(arr, ltype) for p, arr in pn]
1385
+ pic = Image.new("RGB",
1386
+ (int(np.max([i.size[0] for i in imgs])),
1387
+ int(np.sum([m.size[1] for m in imgs]))),
1388
+ (245, 245, 245))
1389
+ height = 0
1390
+ for img in imgs:
1391
+ pic.paste(img, (0, int(height)))
1392
+ height += img.size[1]
1393
+ return pic
1394
+
1395
+ # crop figures out and add captions
1396
+ for k, bxs in figures.items():
1397
+ txt = "\n".join(
1398
+ [b["text"] for b in bxs
1399
+ if not re.match(r"[0-9a-z.\+%-]", b["text"].strip())
1400
+ and len(b["text"].strip()) >= 4
1401
+ ]
1402
+ )
1403
+ if not txt:
1404
+ continue
1405
+
1406
+ res.append(
1407
+ (cropout(
1408
+ bxs,
1409
+ "figure"),
1410
+ [txt] if not return_html else [f"<p>{txt}</p>"]))
1411
+
1412
+ for k, bxs in tables.items():
1413
+ if not bxs:
1414
+ continue
1415
+ res.append((cropout(bxs, "table"),
1416
+ self.__construct_table(bxs, html=return_html)))
1417
+
1418
+ return res
1419
+
1420
+ def proj_match(self, line):
1421
+ if len(line) <= 2:
1422
+ return
1423
+ if re.match(r"[0-9 ().,%%+/-]+$", line):
1424
+ return False
1425
+ for p, j in [
1426
+ (r"第[零一二三四五六七八九十百]+章", 1),
1427
+ (r"第[零一二三四五六七八九十百]+[条节]", 2),
1428
+ (r"[零一二三四五六七八九十百]+[、  ]", 3),
1429
+ (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
1430
+ (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
1431
+ (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
1432
+ (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
1433
+ (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
1434
+ (r".{,48}[::??]$", 9),
1435
+ (r"[0-9]+)", 10),
1436
+ (r"[\((][0-9]+[)\)]", 11),
1437
+ (r"[零一二三四五六七八九十百]+是", 12),
1438
+ (r"[⚫•➢✓]", 12)
1439
+ ]:
1440
+ if re.match(p, line):
1441
+ return j
1442
+ return
1443
+
1444
+ def __filterout_scraps(self, boxes, ZM):
1445
+ def line_tag(bx):
1446
+ pn = [bx["page_number"]]
1447
+ top = bx["top"] - self.page_cum_height[pn[0] - 1]
1448
+ bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
1449
+ while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
1450
+ bott -= self.page_images[pn[-1] - 1].size[1] / ZM
1451
+ pn.append(pn[-1] + 1)
1452
+
1453
+ return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
1454
+ .format("-".join([str(p) for p in pn]),
1455
+ bx["x0"], bx["x1"], top, bott)
1456
+
1457
+ def width(b):
1458
+ return b["x1"] - b["x0"]
1459
+
1460
+ def height(b):
1461
+ return b["bottom"] - b["top"]
1462
+
1463
+ def usefull(b):
1464
+ if b.get("layout_type"):
1465
+ return True
1466
+ if width(
1467
+ b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
1468
+ return True
1469
+ if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
1470
+ return True
1471
+ return False
1472
+
1473
+ res = []
1474
+ while boxes:
1475
+ lines = []
1476
+ widths = []
1477
+ pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
1478
+ mh = self.mean_height[boxes[0]["page_number"] - 1]
1479
+ mj = self.proj_match(
1480
+ boxes[0]["text"]) or boxes[0].get(
1481
+ "layout_type",
1482
+ "") == "title"
1483
+
1484
+ def dfs(line, st):
1485
+ nonlocal mh, pw, lines, widths
1486
+ lines.append(line)
1487
+ widths.append(width(line))
1488
+ width_mean = np.mean(widths)
1489
+ mmj = self.proj_match(
1490
+ line["text"]) or line.get(
1491
+ "layout_type",
1492
+ "") == "title"
1493
+ for i in range(st + 1, min(st + 20, len(boxes))):
1494
+ if (boxes[i]["page_number"] - line["page_number"]) > 0:
1495
+ break
1496
+ if not mmj and self._y_dis(
1497
+ line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
1498
+ break
1499
+
1500
+ if not usefull(boxes[i]):
1501
+ continue
1502
+ if mmj or \
1503
+ (self._x_dis(boxes[i], line) < pw / 10):
1504
+ # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
1505
+ # concat following
1506
+ dfs(boxes[i], i)
1507
+ boxes.pop(i)
1508
+ break
1509
+
1510
+ try:
1511
+ if usefull(boxes[0]):
1512
+ dfs(boxes[0], 0)
1513
+ else:
1514
+ logging.debug("WASTE: " + boxes[0]["text"])
1515
+ except Exception as e:
1516
+ pass
1517
+ boxes.pop(0)
1518
+ mw = np.mean(widths)
1519
+ if mj or mw / pw >= 0.35 or mw > 200:
1520
+ res.append("\n".join([c["text"] + line_tag(c) for c in lines]))
1521
+ else:
1522
+ logging.debug("REMOVED: " +
1523
+ "<<".join([c["text"] for c in lines]))
1524
+
1525
+ return "\n\n".join(res)
1526
+
1527
+ def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
1528
+ self.pdf = pdfplumber.open(fnm)
1529
+ self.lefted_chars = []
1530
+ self.mean_height = []
1531
+ self.mean_width = []
1532
+ self.boxes = []
1533
+ self.garbages = {}
1534
+ self.page_cum_height = [0]
1535
+ self.page_layout = []
1536
+ self.page_images = [p.to_image(
1537
+ resolution=72 * zoomin).annotated for i, p in enumerate(self.pdf.pages[:299])]
1538
+ logging.info("Images converted.")
1539
+ logging.info("Table processed.")
1540
+
1541
+ for i, img in enumerate(self.page_images):
1542
+ chars = [c for c in self.pdf.pages[i].chars if self._has_color(c)]
1543
+ self.mean_height.append(
1544
+ np.median(sorted([c["height"] for c in chars])) if chars else 0
1545
+ )
1546
+ self.mean_width.append(
1547
+ np.median(sorted([c["width"] for c in chars])) if chars else 8
1548
+ )
1549
+ if i > 0:
1550
+ if not chars:
1551
+ self.page_cum_height.append(img.size[1] / zoomin)
1552
+ else:
1553
+ self.page_cum_height.append(
1554
+ np.max([c["bottom"] for c in chars]))
1555
+ self.__ocr_paddle(i + 1, img, chars, zoomin)
1556
+ self.__layouts_paddle(zoomin)
1557
+
1558
+ self.page_cum_height = np.cumsum(self.page_cum_height)
1559
+ assert len(self.page_cum_height) == len(self.page_images)
1560
+
1561
+ garbage = set()
1562
+ for k in self.garbages.keys():
1563
+ self.garbages[k] = Counter(self.garbages[k])
1564
+ for g, c in self.garbages[k].items():
1565
+ if c > 1:
1566
+ garbage.add(g)
1567
+
1568
+ logging.debug("GARBAGE:" + ",".join(garbage))
1569
+ self.boxes = [b for b in self.boxes if b["text"] not in garbage]
1570
+
1571
+ # cumulative Y offset across pages
1572
+ for i in range(len(self.boxes)):
1573
+ self.boxes[i]["top"] += \
1574
+ self.page_cum_height[self.boxes[i]["page_number"] - 1]
1575
+ self.boxes[i]["bottom"] += \
1576
+ self.page_cum_height[self.boxes[i]["page_number"] - 1]
1577
+
1578
+ self.__table_transformer_job(zoomin)
1579
+ self.__text_merge(garbage)
1580
+ self.__filter_forpages()
1581
+ tbls = self.__extract_table_figure(need_image, zoomin, return_html)
1582
+
1583
+ return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
1584
+
1585
+ def remove_tag(self, txt):
1586
+ return re.sub(r"@@[\t0-9.-]+?##", "", txt)
1587
+
1588
+ def crop(self, text, ZM=3):
1589
+ imgs = []
1590
+ for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
1591
+ pn, left, right, top, bottom = tag.strip(
1592
+ "#").strip("@").split("\t")
1593
+ left, right, top, bottom = float(left), float(
1594
+ right), float(top), float(bottom)
1595
+ bottom *= ZM
1596
+ pns = [int(p) - 1 for p in pn.split("-")]
1597
+ for pn in pns[1:]:
1598
+ bottom += self.page_images[pn - 1].size[1]
1599
+ imgs.append(
1600
+ self.page_images[pns[0]].crop((left * ZM, top * ZM,
1601
+ right *
1602
+ ZM, min(
1603
+ bottom, self.page_images[pns[0]].size[1])
1604
+ ))
1605
+ )
1606
+ bottom -= self.page_images[pns[0]].size[1]
1607
+ for pn in pns[1:]:
1608
+ imgs.append(
1609
+ self.page_images[pn].crop((left * ZM, 0,
1610
+ right * ZM,
1611
+ min(bottom,
1612
+ self.page_images[pn].size[1])
1613
+ ))
1614
+ )
1615
+ bottom -= self.page_images[pn].size[1]
1616
+
1617
+ if not imgs:
1618
+ return
1619
+ GAP = 2
1620
+ height = 0
1621
+ for img in imgs:
1622
+ height += img.size[1] + GAP
1623
+ height = int(height)
1624
+ pic = Image.new("RGB",
1625
+ (int(np.max([i.size[0] for i in imgs])), height),
1626
+ (245, 245, 245))
1627
+ height = 0
1628
+ for img in imgs:
1629
+ pic.paste(img, (0, int(height)))
1630
+ height += img.size[1] + GAP
1631
+ return pic
1632
+
1633
+
1634
+ if __name__ == "__main__":
1635
+ pass
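For orientation, a hypothetical end-to-end call of the parser above. The module path and the bare constructor call are assumptions (the constructor and model setup live earlier in this file and need pdfplumber, PaddleOCR and the layout/table models installed); __call__, remove_tag and crop are used exactly as defined above:

from huparser import HuParser   # module name is an assumption

pdf_parser = HuParser()                                  # assumes no required constructor args
text, tbls = pdf_parser("report.pdf", need_image=True,
                        zoomin=3, return_html=True)
print(pdf_parser.remove_tag(text)[:500])                 # chunks with "@@pages\tx0\tx1\ttop\tbottom##" tags stripped
for img, rows in tbls:                                   # (cropped PIL image, table/figure text or HTML)
    print(rows[:1])
first_chunk = text.split("\n\n")[0]
region = pdf_parser.crop(first_chunk, ZM=3)              # re-crop the page region the first chunk came from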
python/res/huqie.txt ADDED
python/res/ner.json ADDED
python/res/synonym.json ADDED
python/util/__init__.py ADDED
File without changes
python/util/config.py ADDED
@@ -0,0 +1,24 @@
1
+ from configparser import ConfigParser
2
+ import os,inspect
3
+
4
+ CF = ConfigParser()
5
+ __fnm = os.path.join(os.path.dirname(__file__), '../conf/sys.cnf')
6
+ if not os.path.exists(__fnm): __fnm = os.path.join(os.path.dirname(__file__), '../../conf/sys.cnf')
7
+ if not os.path.exists(__fnm): __fnm = "./sys.cnf"
8
+ assert os.path.exists(__fnm), f"【EXCEPTION】can't find {__fnm}." + os.path.dirname(__file__)
9
+
10
+ CF.read(__fnm)
11
+
12
+
13
+ class Config:
14
+ def __init__(self, env):
15
+ self.env = env
16
+ if env == "spark":CF.read("./cv.cnf")
17
+
18
+ def get(self, key):
19
+ global CF
20
+ return CF.get(self.env, key)
21
+
22
+ def init(env):
23
+ return Config(env)
24
+
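Config is a thin wrapper over ConfigParser: each environment name is a section of conf/sys.cnf (the file itself is not part of this commit), and the only keys read elsewhere in this change are "es" and "idx_nm". A hypothetical sketch; the section name and values are made up:

# conf/sys.cnf (hypothetical contents)
#   [online]
#   es=http://127.0.0.1:9200
#   idx_nm=docs
from util import config

cf = config.init("online")           # "online" is an assumed environment/section name
es_hosts = cf.get("es").split(",")   # es_conn.py splits this value on commas
index_name = cf.get("idx_nm")        # default Elasticsearch index name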
python/util/es_conn.py ADDED
@@ -0,0 +1,407 @@
1
+ import re
2
+ import logging
3
+ import json
4
+ import time
5
+ import copy
6
+ import elasticsearch
7
+ from elasticsearch import Elasticsearch
8
+ from elasticsearch_dsl import UpdateByQuery, Search, Index
9
+ from util import config
10
+
11
+ print("Elasticsearch version: ", elasticsearch.__version__)
12
+
13
+
14
+ def instance(env):
15
+ CF = config.init(env)
16
+ ES_DRESS = CF.get("es").split(",")
17
+
18
+ ES = Elasticsearch(
19
+ ES_DRESS,
20
+ timeout=600
21
+ )
22
+
23
+ print("ES: ", ES_DRESS, ES.info())
24
+
25
+ return ES
26
+
27
+
28
+ class HuEs:
29
+ def __init__(self, env):
30
+ self.env = env
31
+ self.info = {}
32
+ self.config = config.init(env)
33
+ self.conn()
34
+ self.idxnm = self.config.get("idx_nm")
35
+ if not self.es.ping():
36
+ raise Exception("Can't connect to ES cluster")
37
+
38
+ def conn(self):
39
+ for _ in range(10):
40
+ try:
41
+ c = instance(self.env)
42
+ if c:
43
+ self.es = c
44
+ self.info = c.info()
45
+ logging.info("Connect to es.")
46
+ break
47
+ except Exception as e:
48
+ logging.error("Fail to connect to es: " + str(e))
49
+
50
+ def version(self):
51
+ v = self.info.get("version", {"number": "5.6"})
52
+ v = v["number"].split(".")[0]
53
+ return int(v) >= 7
54
+
55
+ def upsert(self, df, idxnm=""):
56
+ res = []
57
+ for d in df:
58
+ id = d["id"]
59
+ del d["id"]
60
+ d = {"doc": d, "doc_as_upsert": "true"}
61
+ T = False
62
+ for _ in range(10):
63
+ try:
64
+ if not self.version():
65
+ r = self.es.update(
66
+ index=(
67
+ self.idxnm if not idxnm else idxnm),
68
+ body=d,
69
+ id=id,
70
+ doc_type="doc",
71
+ refresh=False,
72
+ retry_on_conflict=100)
73
+ else:
74
+ r = self.es.update(
75
+ index=(
76
+ self.idxnm if not idxnm else idxnm),
77
+ body=d,
78
+ id=id,
79
+ refresh=False,
80
+ doc_type="_doc",
81
+ retry_on_conflict=100)
82
+ logging.info("Successfully upsert: %s" % id)
83
+ T = True
84
+ break
85
+ except Exception as e:
86
+ logging.warning("Fail to index: " +
87
+ json.dumps(d, ensure_ascii=False) + str(e))
88
+ if re.search(r"(Timeout|time out)", str(e), re.IGNORECASE):
89
+ time.sleep(3)
90
+ continue
91
+ self.conn()
92
+ T = False
93
+
94
+ if not T:
95
+ res.append(d)
96
+ logging.error(
97
+ "Fail to index: " +
98
+ re.sub(
99
+ "[\r\n]",
100
+ "",
101
+ json.dumps(
102
+ d,
103
+ ensure_ascii=False)))
104
+ d["id"] = id
105
+ d["_index"] = self.idxnm
106
+
107
+ if not res:
108
+ return True
109
+ return False
110
+
111
+ def bulk(self, df, idx_nm=None):
112
+ ids, acts = {}, []
113
+ for d in df:
114
+ id = d["id"] if "id" in d else d["_id"]
115
+ ids[id] = copy.deepcopy(d)
116
+ ids[id]["_index"] = self.idxnm if not idx_nm else idx_nm
117
+ if "id" in d:
118
+ del d["id"]
119
+ if "_id" in d:
120
+ del d["_id"]
121
+ acts.append(
122
+ {"update": {"_id": id, "_index": ids[id]["_index"]}, "retry_on_conflict": 100})
123
+ acts.append({"doc": d, "doc_as_upsert": "true"})
124
+ logging.info("bulk upsert: %s" % id)
125
+
126
+ res = []
127
+ for _ in range(100):
128
+ try:
129
+ if elasticsearch.__version__[0] < 8:
130
+ r = self.es.bulk(
131
+ index=(
132
+ self.idxnm if not idx_nm else idx_nm),
133
+ body=acts,
134
+ refresh=False,
135
+ timeout="600s")
136
+ else:
137
+ r = self.es.bulk(index=(self.idxnm if not idx_nm else
138
+ idx_nm), operations=acts,
139
+ refresh=False, timeout="600s")
140
+ if re.search(r"False", str(r["errors"]), re.IGNORECASE):
141
+ return res
142
+
143
+ for it in r["items"]:
144
+ if "error" in it["update"]:
145
+ res.append(str(it["update"]["_id"]) +
146
+ ":" + str(it["update"]["error"]))
147
+
148
+ return res
149
+ except Exception as e:
150
+ logging.warn("Fail to bulk: " + str(e))
151
+ print(e)
152
+ if re.search(r"(Timeout|time out)", str(e), re.IGNORECASE):
153
+ time.sleep(3)
154
+ continue
155
+ self.conn()
156
+
157
+ return res
158
+
159
+ def bulk4script(self, df):
160
+ ids, acts = {}, []
161
+ for d in df:
162
+ id = d["id"]
163
+ ids[id] = copy.deepcopy(d["raw"])
164
+ acts.append({"update": {"_id": id, "_index": self.idxnm}})
165
+ acts.append(d["script"])
166
+ logging.info("bulk upsert: %s" % id)
167
+
168
+ res = []
169
+ for _ in range(10):
170
+ try:
171
+ if not self.version():
172
+ r = self.es.bulk(
173
+ index=self.idxnm,
174
+ body=acts,
175
+ refresh=False,
176
+ timeout="600s",
177
+ doc_type="doc")
178
+ else:
179
+ r = self.es.bulk(
180
+ index=self.idxnm,
181
+ body=acts,
182
+ refresh=False,
183
+ timeout="600s")
184
+ if re.search(r"False", str(r["errors"]), re.IGNORECASE):
185
+ return res
186
+
187
+ for it in r["items"]:
188
+ if "error" in it["update"]:
189
+ res.append(str(it["update"]["_id"]))
190
+
191
+ return res
192
+ except Exception as e:
193
+ logging.warning("Fail to bulk: " + str(e))
194
+ if re.search(r"(Timeout|time out)", str(e), re.IGNORECASE):
195
+ time.sleep(3)
196
+ continue
197
+ self.conn()
198
+
199
+ return res
200
+
201
+ def rm(self, d):
202
+ for _ in range(10):
203
+ try:
204
+ if not self.version():
205
+ r = self.es.delete(
206
+ index=self.idxnm,
207
+ id=d["id"],
208
+ doc_type="doc",
209
+ refresh=True)
210
+ else:
211
+ r = self.es.delete(
212
+ index=self.idxnm,
213
+ id=d["id"],
214
+ refresh=True,
215
+ doc_type="_doc")
216
+ logging.info("Remove %s" % d["id"])
217
+ return True
218
+ except Exception as e:
219
+ logging.warn("Fail to delete: " + str(d) + str(e))
220
+ if re.search(r"(Timeout|time out)", str(e), re.IGNORECASE):
221
+ time.sleep(3)
222
+ continue
223
+ if re.search(r"(not_found)", str(e), re.IGNORECASE):
224
+ return True
225
+ self.conn()
226
+
227
+ logging.error("Fail to delete: " + str(d))
228
+
229
+ return False
230
+
231
+ def search(self, q, idxnm=None, src=False, timeout="2s"):
232
+ print(json.dumps(q, ensure_ascii=False))
233
+ for i in range(3):
234
+ try:
235
+ res = self.es.search(index=(self.idxnm if not idxnm else idxnm),
236
+ body=q,
237
+ timeout=timeout,
238
+ # search_type="dfs_query_then_fetch",
239
+ track_total_hits=True,
240
+ _source=src)
241
+ if str(res.get("timed_out", "")).lower() == "true":
242
+ raise Exception("Es Timeout.")
243
+ return res
244
+ except Exception as e:
245
+ logging.error(
246
+ "ES search exception: " +
247
+ str(e) +
248
+ "【Q】:" +
249
+ str(q))
250
+ if str(e).find("Timeout") > 0:
251
+ continue
252
+ raise e
253
+ logging.error("ES search timeout for 3 times!")
254
+ raise Exception("ES search timeout.")
255
+
256
+ def updateByQuery(self, q, d):
257
+ ubq = UpdateByQuery(index=self.idxnm).using(self.es).query(q)
258
+ scripts = ""
259
+ for k, v in d.items():
260
+ scripts += "ctx._source.%s = params.%s;" % (str(k), str(k))
261
+ ubq = ubq.script(source=scripts, params=d)
262
+ ubq = ubq.params(refresh=False)
263
+ ubq = ubq.params(slices=5)
264
+ ubq = ubq.params(conflicts="proceed")
265
+ for i in range(3):
266
+ try:
267
+ r = ubq.execute()
268
+ return True
269
+ except Exception as e:
270
+ logging.error("ES updateByQuery exception: " +
271
+ str(e) + "【Q】:" + str(q.to_dict()))
272
+ if str(e).find("Timeout") > 0 or str(e).find("Conflict") > 0:
273
+ continue
274
+
275
+ return False
276
+
277
+ def deleteByQuery(self, query, idxnm=""):
278
+ for i in range(3):
279
+ try:
280
+ r = self.es.delete_by_query(
281
+ index=idxnm if idxnm else self.idxnm,
282
+ body=Search().query(query).to_dict())
283
+ return True
284
+ except Exception as e:
285
+ logging.error("ES updateByQuery deleteByQuery: " +
286
+ str(e) + "【Q】:" + str(query.to_dict()))
287
+ if str(e).find("Timeout") > 0 or str(e).find("Conflict") > 0:
288
+ continue
289
+
290
+ return False
291
+
292
+ def update(self, id, script, routing=None):
293
+ for i in range(3):
294
+ try:
295
+ if not self.version():
296
+ r = self.es.update(
297
+ index=self.idxnm,
298
+ id=id,
299
+ body=json.dumps(
300
+ script,
301
+ ensure_ascii=False),
302
+ doc_type="doc",
303
+ routing=routing,
304
+ refresh=False)
305
+ else:
306
+ r = self.es.update(index=self.idxnm, id=id, body=json.dumps(script, ensure_ascii=False),
307
+ routing=routing, refresh=False) # , doc_type="_doc")
308
+ return True
309
+ except Exception as e:
310
+ print(e)
311
+ logging.error("ES update exception: " + str(e) + " id:" + str(id) + ", version:" + str(self.version()) +
312
+ json.dumps(script, ensure_ascii=False))
313
+ if str(e).find("Timeout") > 0:
314
+ continue
315
+
316
+ return False
317
+
318
+ def indexExist(self, idxnm):
319
+ s = Index(idxnm if idxnm else self.idxnm, self.es)
320
+ for i in range(3):
321
+ try:
322
+ return s.exists()
323
+ except Exception as e:
324
+ logging.error("ES updateByQuery indexExist: " + str(e))
325
+ if str(e).find("Timeout") > 0 or str(e).find("Conflict") > 0:
326
+ continue
327
+
328
+ return False
329
+
330
+ def docExist(self, docid, idxnm=None):
331
+ for i in range(3):
332
+ try:
333
+ return self.es.exists(index=(idxnm if idxnm else self.idxnm),
334
+ id=docid)
335
+ except Exception as e:
336
+ logging.error("ES Doc Exist: " + str(e))
337
+ if str(e).find("Timeout") > 0 or str(e).find("Conflict") > 0:
338
+ continue
339
+ return False
340
+
341
+ def createIdx(self, idxnm, mapping):
342
+ try:
343
+ if elasticsearch.__version__[0] < 8:
344
+ return self.es.indices.create(idxnm, body=mapping)
345
+ from elasticsearch.client import IndicesClient
346
+ return IndicesClient(self.es).create(index=idxnm,
347
+ settings=mapping["settings"],
348
+ mappings=mapping["mappings"])
349
+ except Exception as e:
350
+ logging.error("ES create index error %s ----%s" % (idxnm, str(e)))
351
+
352
+ def deleteIdx(self, idxnm):
353
+ try:
354
+ return self.es.indices.delete(idxnm, allow_no_indices=True)
355
+ except Exception as e:
356
+ logging.error("ES delete index error %s ----%s" % (idxnm, str(e)))
357
+
358
+ def getTotal(self, res):
359
+ if isinstance(res["hits"]["total"], type({})):
360
+ return res["hits"]["total"]["value"]
361
+ return res["hits"]["total"]
362
+
363
+ def getDocIds(self, res):
364
+ return [d["_id"] for d in res["hits"]["hits"]]
365
+
366
+ def getSource(self, res):
367
+ rr = []
368
+ for d in res["hits"]["hits"]:
369
+ d["_source"]["id"] = d["_id"]
370
+ d["_source"]["_score"] = d["_score"]
371
+ rr.append(d["_source"])
372
+ return rr
373
+
374
+ def scrollIter(self, pagesize=100, scroll_time='2m', q={
375
+ "query": {"match_all": {}}, "sort": [{"updated_at": {"order": "desc"}}]}):
376
+ for _ in range(100):
377
+ try:
378
+ page = self.es.search(
379
+ index=self.idxnm,
380
+ scroll=scroll_time,
381
+ size=pagesize,
382
+ body=q,
383
+ _source=None
384
+ )
385
+ break
386
+ except Exception as e:
387
+ logging.error("ES scrolling fail. " + str(e))
388
+ time.sleep(3)
389
+
390
+ sid = page['_scroll_id']
391
+ scroll_size = page['hits']['total']["value"]
392
+ logging.info("[TOTAL]%d" % scroll_size)
393
+ # Start scrolling
394
+ while scroll_size > 0:
395
+ yield page["hits"]["hits"]
396
+ for _ in range(100):
397
+ try:
398
+ page = self.es.scroll(scroll_id=sid, scroll=scroll_time)
399
+ break
400
+ except Exception as e:
401
+ logging.error("ES scrolling fail. " + str(e))
402
+ time.sleep(3)
403
+
404
+ # Update the scroll ID
405
+ sid = page['_scroll_id']
406
+ # Get the number of results that we returned in the last scroll
407
+ scroll_size = len(page['hits']['hits'])
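A hypothetical round trip with HuEs using the methods above; the environment name, the mapping file path and the document fields are illustrative assumptions:

import json
from util.es_conn import HuEs

es = HuEs("online")                                 # reads "es" and "idx_nm" from conf/sys.cnf
if not es.indexExist(es.idxnm):
    with open("conf/mapping.json") as f:            # a {"settings":..., "mappings":...} dict, as createIdx expects
        es.createIdx(es.idxnm, json.load(f))

errors = es.bulk([{"id": "doc-1", "title_tks": "hello world"}])   # field name is illustrative
assert not errors, errors
res = es.search({"query": {"match_all": {}}, "size": 1})
print(es.getTotal(res), es.getDocIds(res))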
python/util/setup_logging.py ADDED
@@ -0,0 +1,36 @@
1
+ import json
2
+ import logging.config
3
+ import os
4
+
5
+
6
+ def log_dir():
7
+ fnm = os.path.join(os.path.dirname(__file__), '../log/')
8
+ if not os.path.exists(fnm):
9
+ fnm = os.path.join(os.path.dirname(__file__), '../../log/')
10
+ assert os.path.exists(fnm), f"Can't locate log dir: {fnm}"
11
+ return fnm
12
+
13
+
14
+ def setup_logging(default_path="conf/logging.json",
15
+ default_level=logging.INFO,
16
+ env_key="LOG_CFG"):
17
+ path = default_path
18
+ value = os.getenv(env_key, None)
19
+ if value:
20
+ path = value
21
+ if os.path.exists(path):
22
+ with open(path, "r") as f:
23
+ config = json.load(f)
24
+ fnm = log_dir()
25
+
26
+ config["handlers"]["info_file_handler"]["filename"] = fnm + "info.log"
27
+ config["handlers"]["error_file_handler"]["filename"] = fnm + "error.log"
28
+ logging.config.dictConfig(config)
29
+ else:
30
+ logging.basicConfig(level=default_level)
31
+
32
+
33
+ __fnm = os.path.join(os.path.dirname(__file__), '../conf/logging.json')
34
+ if not os.path.exists(__fnm):
35
+ __fnm = os.path.join(os.path.dirname(__file__), '../../conf/logging.json')
36
+ setup_logging(__fnm)
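Importing this module configures logging at import time, so the rest of the code only needs the standard logging API; a minimal sketch assuming the package is importable as util:

import logging
from util import setup_logging   # noqa: F401 -- the import side effect applies conf/logging.json

log = logging.getLogger(__name__)
log.info("parser started")       # console plus log/info.log when the JSON config is found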