Spaces:

etrotta
/

pdf-ocr

Sleeping

App Files Files Community

etrotta committed on Aug 9, 2024

Commit

91acf95

1 Parent(s): 6e2d071

first

Browse files

Files changed (4) hide show

Roboto-Regular.ttf +0 -0
app.py +80 -0
packages.txt +1 -0
requirements.txt +4 -0

Roboto-Regular.ttf ADDED Viewed

Binary file (168 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import pathlib
+import streamlit as st
+from PIL import Image, ImageDraw, ImageFont
+from streamlit_image_coordinates import streamlit_image_coordinates
+import pytesseract
+import pypdfium2
+from streamlit.runtime.uploaded_file_manager import UploadedFile
+# Language spec handed to pytesseract as `lang`; presumably English plus
+# Portuguese ("+" being Tesseract's separator for multiple languages).
+LANG = "eng+por"
+# Extra configuration string handed to pytesseract as `config` (empty here).
+OPTIONS = ""
+# Font file expected to sit next to this script; used to label the selected word.
+FONT_FILE = pathlib.Path(__file__).parent / "Roboto-Regular.ttf"
+# NOTE(review): Streamlit has historically required set_page_config to be the
+# first Streamlit command on a page, so keep it ahead of the title below.
+st.set_page_config(
+    page_title="Streamlit Demo",
+    layout="wide",
+    page_icon="🪟",
+)
+# Bare string expression: relies on Streamlit "magic" to render it as a heading.
+"# Streamlit PDF OCR Demo"
+@st.cache_data(hash_funcs={"file": lambda uploaded_file: hash(uploaded_file.file_id)})
+def convert_pdf_to_img(file: UploadedFile) -> Image.Image:
+    doc = pypdfium2.PdfDocument(file)
+    raw_image: pypdfium2.PdfBitmap = doc[0].render(scale=2, grayscale=True)
+    image: Image.Image = raw_image.to_pil()
+    return image
+@st.cache_data
+def load_image_and_ocr(img: Image.Image, lang: str, config: str) -> tuple[Image.Image, dict, list[dict]]:
+    data = pytesseract.image_to_data(img, lang, config, output_type=pytesseract.Output.DICT)
+    _boxes = [{"text": text} for text in data["text"]]
+    for key in ("left", "top", "width", "height"):
+        for i, val in enumerate(data[key]):
+            _boxes[i][key] = val
+    boxes = [box for box in _boxes if box['text'].strip() != '']
+    return img.convert("RGB"), data, boxes
+def main(image: Image.Image):
+    img, data, boxes = load_image_and_ocr(image, LANG, OPTIONS)
+    if (coords := st.session_state.get("image_coords")) is None:
+        coords = {"x": 0, "y": 0}
+    draw = ImageDraw.Draw(img)
+    text = None
+    for box in boxes:
+        MARGIN = 5
+        x1, y1, width, height = (box['left'], box['top'], box['width'], box['height'])
+        x2, y2 = x1+width, y1+height
+        x1, y1, x2, y2 = (x1-MARGIN, y1-MARGIN, x2+MARGIN, y2+MARGIN)
+        if (x1 <= coords["x"] <= x2) and (y1 <= coords["y"] <= y2):
+            color = "blue"
+            font = ImageFont.FreeTypeFont(FONT_FILE, size=24)
+            text = box['text']
+            # draw.text((0, 15), text, fill="black", font=font)
+            draw.text(((x1+x2)/2, y1-15), text, align="center", anchor="mm", fill="red", font=font)
+        else:
+            color = "green"
+        draw.rectangle((x1, y1, x2, y2), fill=None, outline=color, width=2)
+    "## Click at a rectangle"
+    if text:
+        f'''### Selected text: "{text}"'''
+    else:
+        '''### Selected text will appear here'''
+    # Another option would be `click_and_drag=True` with single-word orientation mode
+    streamlit_image_coordinates(img, key="image_coords", click_and_drag=False)
+st.file_uploader("Upload PDF", ".pdf", key="pdf")
+if (pdf_file := st.session_state.get("pdf")) is not None:
+    base_image = convert_pdf_to_img(pdf_file)
+    main(base_image)

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


+tesseract-ocr-all

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+pillow
+pypdfium2
+pytesseract
+git+https://github.com/blackary/streamlit-image-coordinates@8174f1fae8cda92d97f64c212e3ee455165e4deb