first
Browse files- Roboto-Regular.ttf +0 -0
- app.py +80 -0
- packages.txt +1 -0
- requirements.txt +4 -0
Roboto-Regular.ttf
ADDED
Binary file (168 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pathlib
|
2 |
+
import streamlit as st
|
3 |
+
from PIL import Image, ImageDraw, ImageFont
|
4 |
+
from streamlit_image_coordinates import streamlit_image_coordinates
|
5 |
+
import pytesseract
|
6 |
+
import pypdfium2
|
7 |
+
|
8 |
+
from streamlit.runtime.uploaded_file_manager import UploadedFile
|
9 |
+
|
10 |
+
|
11 |
+
# Tesseract language string handed to pytesseract ("+"-joined language codes).
LANG = "eng+por"
# Extra Tesseract command-line configuration; empty means none.
OPTIONS = ""
# TrueType font shipped next to this script, used to label the selected word.
FONT_FILE = pathlib.Path(__file__).with_name("Roboto-Regular.ttf")

st.set_page_config(page_title="Streamlit Demo", layout="wide", page_icon="🪟")

# Bare string expression: rendered as a page heading by Streamlit "magic".
"# Streamlit PDF OCR Demo"
|
22 |
+
|
23 |
+
# hash_funcs keys must be types (or fully-qualified type names), not parameter
# names; keying on UploadedFile makes the cache use the upload's unique id
# instead of falling back to Streamlit's default hashing of the file object.
@st.cache_data(hash_funcs={UploadedFile: lambda uploaded_file: uploaded_file.file_id})
def convert_pdf_to_img(file: UploadedFile) -> Image.Image:
    """Render the first page of an uploaded PDF as a grayscale PIL image.

    Args:
        file: The PDF as returned by ``st.file_uploader``.

    Returns:
        The first page rendered at 2x scale in grayscale.
    """
    doc = pypdfium2.PdfDocument(file)
    # Only page 0 is rendered; any further pages are ignored.
    raw_image: pypdfium2.PdfBitmap = doc[0].render(scale=2, grayscale=True)
    image: Image.Image = raw_image.to_pil()
    return image
|
29 |
+
|
30 |
+
|
31 |
+
@st.cache_data
def load_image_and_ocr(img: Image.Image, lang: str, config: str) -> tuple[Image.Image, dict, list[dict]]:
    """Run Tesseract on ``img`` and collect one bounding box per non-blank entry.

    Args:
        img: Image to run OCR on.
        lang: Language argument forwarded to ``pytesseract.image_to_data``.
        config: Config argument forwarded to ``pytesseract.image_to_data``.

    Returns:
        A tuple ``(rgb_image, data, boxes)``: the input converted to RGB, the
        raw dict returned by pytesseract, and a list of dicts with the keys
        ``text``, ``left``, ``top``, ``width`` and ``height`` for every entry
        whose text is not blank.
    """
    data = pytesseract.image_to_data(img, lang, config, output_type=pytesseract.Output.DICT)

    fields = ("text", "left", "top", "width", "height")
    # Walk the parallel per-field lists in lockstep, one row per OCR entry.
    # NOTE(review): assumes all five lists have the same length - confirm.
    rows = zip(*(data[name] for name in fields))
    boxes = [dict(zip(fields, row)) for row in rows if row[0].strip() != '']

    return img.convert("RGB"), data, boxes
|
40 |
+
|
41 |
+
|
42 |
+
def main(image: Image.Image):
    """OCR ``image``, outline every detected word and show the clicked one.

    Each OCR box is drawn (padded by a small margin) in green. The box that
    contains the last click is drawn in blue with its text rendered in red
    above it, and that text is also shown in a heading. The annotated image
    is displayed through ``streamlit_image_coordinates`` so that the next
    click is stored in ``st.session_state["image_coords"]``.

    Args:
        image: The page image to analyse.
    """
    img, _, boxes = load_image_and_ocr(image, LANG, OPTIONS)

    # None until the user has clicked the image. The hit-test is skipped in
    # that case rather than testing a fake (0, 0) click, which would wrongly
    # select a word sitting in the top-left corner.
    coords = st.session_state.get("image_coords")

    margin = 5
    # The font is only needed to label a clicked box; load it once, up front.
    font = ImageFont.FreeTypeFont(FONT_FILE, size=24) if coords is not None else None

    draw = ImageDraw.Draw(img)
    text = None
    for box in boxes:
        x1 = box['left'] - margin
        y1 = box['top'] - margin
        x2 = box['left'] + box['width'] + margin
        y2 = box['top'] + box['height'] + margin

        clicked = (
            coords is not None
            and (x1 <= coords["x"] <= x2)
            and (y1 <= coords["y"] <= y2)
        )
        if clicked:
            color = "blue"
            # If padded boxes overlap, the last matching box wins.
            text = box['text']
            draw.text(((x1+x2)/2, y1-15), text, align="center", anchor="mm", fill="red", font=font)
        else:
            color = "green"

        draw.rectangle((x1, y1, x2, y2), fill=None, outline=color, width=2)

    # Bare string expressions below are rendered by Streamlit "magic".
    "## Click at a rectangle"
    if text:
        f'''### Selected text: "{text}"'''
    else:
        '''### Selected text will appear here'''

    # Another option would be `click_and_drag=True` with single-word orientation mode
    streamlit_image_coordinates(img, key="image_coords", click_and_drag=False)
|
75 |
+
|
76 |
+
# The uploader's value is read back from session state under the "pdf" key.
st.file_uploader("Upload PDF", ".pdf", key="pdf")

pdf_file = st.session_state.get("pdf")
if pdf_file is not None:
    # Render the first page, then run the OCR / selection UI on it.
    main(convert_pdf_to_img(pdf_file))
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
tesseract-ocr-all
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pillow
|
2 |
+
pypdfium2
|
3 |
+
pytesseract
|
4 |
+
git+https://github.com/blackary/streamlit-image-coordinates@8174f1fae8cda92d97f64c212e3ee455165e4deb
|