Spaces:

ITNovaML
/

sparrow-data-itn

Runtime error

App Files Files Community

ITNovaML

katanaml committed on Jul 14, 2023

Commit

f25b29f

0 Parent(s):

Duplicate from katanaml-org/sparrow-data

Browse files

Co-authored-by: Andrej Baranovskij <[email protected]>

Files changed (16) hide show

.gitattributes +34 -0
.gitignore +2 -0
Dockerfile +26 -0
README.md +12 -0
__init__.py +0 -0
config.py +13 -0
data/ocr_stats.json +1 -0
data/result.json +53 -0
endpoints.py +24 -0
requirements-fastapi.txt +12 -0
routers/__init__.py +0 -0
routers/chatgpt_plugin.py +139 -0
routers/data_utils.py +227 -0
routers/dataset.py +59 -0
routers/ocr.py +187 -0
utils.py +29 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,34 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+
2	+ .DS_Store

Dockerfile ADDED Viewed

	@@ -0,0 +1,26 @@

+FROM python:3.7
+WORKDIR /code
+COPY requirements-fastapi.txt ./
+# Installing libGL plus the poppler tools/headers in one layer.
+# "apt-get update" and "apt-get install" share a single RUN so a cached, stale
+# package index is never reused by a later install step; the package lists are
+# removed afterwards because they are not needed at runtime.
+RUN apt-get update && apt-get install -y \
+    libgl1-mesa-dev \
+    poppler-utils \
+    libpoppler-cpp-dev \
+    && rm -rf /var/lib/apt/lists/*
+RUN pip install --no-cache-dir --upgrade -r /code/requirements-fastapi.txt
+# Run the application as a non-root user with UID 1000.
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+	PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+COPY --chown=user . $HOME/app/
+CMD ["uvicorn", "endpoints:app", "--host", "0.0.0.0", "--port", "7860"]

README.md ADDED Viewed

	@@ -0,0 +1,12 @@

+---
+title: Sparrow Data
+emoji: 🏃
+colorFrom: pink
+colorTo: gray
+sdk: docker
+pinned: false
+license: mit
+duplicated_from: katanaml-org/sparrow-data
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

__init__.py ADDED Viewed

File without changes

config.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from pydantic import BaseSettings
+import os
+class Settings(BaseSettings):
+    # Application configuration shared by the routers.
+    # The three *_key defaults are read from the environment when this module is
+    # imported; os.environ.get returns None for an unset variable, so a default
+    # may be None despite the str annotation.
+    # Token passed to huggingface_hub.login in routers/dataset.py.
+    huggingface_key: str = os.environ.get("huggingface_key")
+    # Shared secret the endpoints compare against the caller-supplied sparrow_key.
+    sparrow_key: str = os.environ.get("sparrow_key")
+    # Base64 text that routers/data_utils.py decodes and uses as the AES key.
+    secure_key: str = os.environ.get("secure_key")
+    # Dataset identifier handed to datasets.load_dataset.
+    dataset_name: str = "katanaml-org/invoices-donut-data-v1"
+    # JSON file that receives one entry per OCR run (see utils.log_stats).
+    ocr_stats_file: str = "data/ocr_stats.json"
+settings = Settings()

data/ocr_stats.json ADDED Viewed

	@@ -0,0 +1 @@

+ [[0.0, "receipt_00001.png", "2023-05-23 10:55:43"], [19.22510600090027, "receipt_00001.png", "2023-05-23 11:11:02"], [4.0531158447265625e-06, "receipt_00001.png", "2023-05-23 11:11:10"], [3.0994415283203125e-06, "receipt_00001.png", "2023-05-23 11:11:11"], [3.0994415283203125e-06, "receipt_00001.png", "2023-05-23 11:11:12"], [2.86102294921875e-06, "receipt_00001.png", "2023-05-23 11:11:13"], [3.0994415283203125e-06, "receipt_00001.png", "2023-05-23 11:11:13"], [3.653481960296631, "receipt_00001.png", "2023-05-23 11:32:48"], [8.929341077804565, "receipt_00001.png", "2023-05-23 11:34:52"], [3.5088820457458496, "receipt_00001.png", "2023-05-23 16:32:17"], [2.863774061203003, "receipt_00001.png", "2023-05-23 16:32:56"], [4.174198150634766, "inout-20211211_001.jpg", "2023-05-23 16:38:33"], [4.616858243942261, "inout-20211211_001.jpg", "2023-05-23 16:39:28"], [4.6479880809783936, "inout-20211211_001.jpg", "2023-05-23 16:47:27"], [4.756654262542725, "inout-20211211_001.jpg", "2023-05-23 22:07:14"], [5.704661130905151, "wholefoods-20211211_005.jpg", "2023-05-23 22:16:38"], [6.363792896270752, "wholefoods-20211211_005.jpg", "2023-05-23 22:24:13"], [6.582294940948486, "cvs-20211211_009.jpg", "2023-05-23 22:43:41"], [8.032721757888794, "oldnavy-20211211_015.jpg", "2023-05-23 22:45:58"], [6.35598611831665, "ross-20211211_010.jpg", "2023-05-23 22:47:50"], [7.241703987121582, "ross-20211211_010.jpg", "2023-05-24 11:03:57"], [6.259234189987183, "ross-20211211_010.pdf", "2023-05-24 11:04:26"], [7.275213956832886, "ross-20211211_010.pdf", "2023-05-24 11:05:25"], [5.848371982574463, "invoice_10.jpg", "2023-05-24 11:06:21"], [6.028747081756592, "invoice_10.jpg", "2023-05-24 11:12:14"], [6.5253260135650635, "cvs-20211211_009.jpg", "2023-05-24 11:29:53"], [6.507750988006592, "cvs-20211211_009.jpg", "2023-05-24 12:55:14"], [2.864002227783203, "receipt_00001.png", "2023-05-24 12:55:30"], [2.9030818939208984, "receipt_00001.png", "2023-05-24 12:55:40"], [5.672614097595215, 
"wholefoods-20211211_005.jpg", "2023-05-24 12:56:13"], [5.712976932525635, "wholefoods-20211211_005.pdf", "2023-05-24 12:56:29"], [5.984729051589966, "invoice_10.jpg", "2023-05-24 13:00:23"], [7.3337507247924805, "bestbuy-20211211_006.pdf", "2023-05-24 13:01:13"], [4.676954984664917, "inout-20211211_001.jpg", "2023-05-24 21:09:53"], [3.9793169498443604, "inout-20211211_001.jpg", "2023-05-24 22:01:12"], [4.716302871704102, "inout-20211211_001.jpg", "2023-05-24 22:07:19"], [4.611649990081787, "inout-20211211_001.jpg", "2023-05-24 22:11:00"], [5.18176007270813, "inout-20211211_001.jpg", "2023-05-24 22:12:26"], [4.76771092414856, "inout-20211211_001.jpg", "2023-05-25 10:00:11"], [4.62838888168335, "inout-20211211_001.jpg", "2023-05-25 10:12:36"], [4.6390650272369385, "inout-20211211_001.jpg", "2023-05-25 10:35:31"], [4.605455160140991, "inout-20211211_001.jpg", "2023-05-25 10:36:59"], [4.541555881500244, "inout-20211211_001.jpg", "2023-05-25 10:37:41"], [4.652244806289673, "inout-20211211_001.jpg", "2023-05-25 10:38:09"], [3.947613000869751, "inout-20211211_001.jpg", "2023-05-25 10:58:54"], [4.597126245498657, "inout-20211211_001.jpg", "2023-05-25 11:00:03"], [4.6871421337127686, "inout-20211211_001.jpg", "2023-05-25 11:02:44"], [4.579195976257324, "inout-20211211_001.jpg", "2023-05-25 11:32:11"], [4.734511137008667, "inout-20211211_001.jpg", "2023-05-25 11:33:03"], [4.602473258972168, "inout-20211211_001.jpg", "2023-05-25 11:44:11"], [4.563000202178955, "inout-20211211_001.jpg", "2023-05-25 11:47:35"], [4.576035022735596, "inout-20211211_001.jpg", "2023-05-25 11:49:55"], [4.860241889953613, "inout-20211211_001.jpg", "2023-05-25 11:53:19"], [4.693282127380371, "inout-20211211_001.jpg", "2023-05-25 11:56:00"], [4.5564610958099365, "inout-20211211_001.jpg", "2023-05-25 16:02:52"], [5.022596836090088, "inout-20211211_001.jpg", "2023-05-25 16:03:47"], [4.650119781494141, "inout-20211211_001.jpg", "2023-05-25 16:27:35"], [6.16159200668335, "inout-20211211_001.jpg", 
"2023-05-30 22:15:29"], [9.421452045440674, "../docs/models/donut/data/img/test/invoice_2.jpg", "2023-06-07 21:03:34"]]

data/result.json ADDED Viewed

	@@ -0,0 +1,53 @@

+['YOUR GUEST NUMBER IS']
+['43']
+['IN-N-OUT BURGER LINQ']
+['320 6 2166 6301']
+['Cashier: SERJI0 SA']
+['Check : 43']
+['TRANS #: 6301']
+['1 Db1-Dbl']
+['5.25']
+['+ Onion']
+['1 Fry']
+['2.35']
+['1 Med Soft Drink']
+['2.15']
+['COUNTER-Eat In']
+['9.75']
+['TAX 8.375%']
+['.82']
+['Amount Due']
+['$10.57']
+['Tender MasterCard']
+['$10.57']
+['Change']
+['$.00']
+['CHARGE DETAIL']
+['SALE']
+['Card Type:']
+['Mastercard']
+['Account :']
+['************5562 R']
+['Auth Code:']
+['NDTQU8']
+['Trans #:']
+['6301']
+['Auth Ref :']
+['2015517078']
+['AUTH AMT :']
+['$10.57']
+['AID:']
+['A0000000041010']
+['TVR :']
+['0000008001']
+['TSI:']
+['0000']
+['App Name:']
+['Debit MasterCard']
+['ARQC:']
+['ADCF5208793B7BD6']
+['THANK YOU!']
+['Quest ions/Comments: Cal1 800-786-1 :10']
+['L1 T6']
+['9:21 PM']
+['2021-11-30']

endpoints.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from routers import dataset
+from routers import ocr
+from routers import chatgpt_plugin
+# Application object; the OpenAPI schema and the interactive docs are served under /api/v1/sparrow-data/.
+app = FastAPI(openapi_url="/api/v1/sparrow-data/openapi.json", docs_url="/api/v1/sparrow-data/docs")
+# NOTE(review): every origin, method and header is allowed together with
+# allow_credentials=True. Confirm that a wildcard origin with credentials is
+# intended; listing the trusted origins explicitly is the safer configuration.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+    allow_credentials=True,
+)
+# Each router is mounted under its own versioned prefix and tagged for the generated docs.
+app.include_router(dataset.router, prefix="/api-dataset/v1/sparrow-data", tags=["Dataset"])
+app.include_router(ocr.router, prefix="/api-ocr/v1/sparrow-data", tags=["OCR"])
+app.include_router(chatgpt_plugin.router, prefix="/api-chatgpt-plugin/v1/sparrow-data", tags=["ChatGPT Plugin"])
+@app.get("/")
+async def root():
+    """Return a static payload that identifies the service."""
+    payload = {"message": "Sparrow Data API"}
+    return payload

requirements-fastapi.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+pdf2image==1.16.2
+python-poppler==0.4.1
+datasets==2.10.1
+Pillow==9.5.0
+paddlepaddle==2.4.2
+paddleocr==2.6.1.3
+fastapi==0.96.0
+python-multipart
+motor==3.1.2
+pydantic==1.10.8
+pycryptodome==3.18.0
+uvicorn[standard]

routers/__init__.py ADDED Viewed

File without changes

routers/chatgpt_plugin.py ADDED Viewed

	@@ -0,0 +1,139 @@

+from fastapi import APIRouter, HTTPException, Response, Form
+from config import settings
+import os
+import motor.motor_asyncio
+from routers.data_utils import get_receipt_data
+from routers.data_utils import store_receipt_db_data
+from routers.data_utils import get_receipt_db_data
+from routers.data_utils import delete_receipt_db_data
+from routers.data_utils import get_user_receipt_db_ids
+from routers.data_utils import get_user_receipt_content_db
+from pymongo.errors import PyMongoError
+import json
+# Router that collects the ChatGPT plugin endpoints defined in this module.
+router = APIRouter()
+# MongoDB client and database handle; both stay None until the startup hook
+# assigns them, which only happens when MONGODB_URL is set.
+client = None
+db = None
+@router.on_event("startup")
+async def startup_event():
+    """Open the MongoDB connection when MONGODB_URL is present in the environment."""
+    global client, db
+    if "MONGODB_URL" not in os.environ:
+        # No database configured: the module-level handles are left untouched.
+        return
+    mongo_url = os.environ["MONGODB_URL"]
+    client = motor.motor_asyncio.AsyncIOMotorClient(mongo_url)
+    db = client.chatgpt_plugin
+    print("Connected to MongoDB from ChatGPT plugin!")
+@router.on_event("shutdown")
+async def shutdown_event():
+    """Close the MongoDB client if one was opened."""
+    # Test the client itself rather than the environment variable: the variable
+    # can be set while the client is still None (the startup hook did not run
+    # or failed), and calling close() on None would raise during shutdown.
+    if client is not None:
+        client.close()
+@router.get("/receipt_by_id")
+async def get_receipt_by_id(receipt_id: str, sparrow_key: str):
+    """Return the decrypted content of an uploaded receipt.
+
+    Args:
+        receipt_id: Key of the upload to fetch.
+        sparrow_key: Caller-supplied key; must equal settings.sparrow_key.
+
+    Returns:
+        The value returned by get_receipt_data, or an error dict when the key
+        does not match.
+
+    Raises:
+        HTTPException: 400 when MONGODB_URL is not configured, 404 when no
+            receipt exists for receipt_id.
+    """
+    if sparrow_key != settings.sparrow_key:
+        return {"error": "Invalid Sparrow key."}
+    if "MONGODB_URL" not in os.environ:
+        # The exception must be raised, not returned: a returned HTTPException
+        # is serialised as an ordinary 200 response body.
+        raise HTTPException(status_code=400, detail="No MongoDB URL provided.")
+    result = await get_receipt_data(receipt_id, db)
+    if result is None:
+        raise HTTPException(status_code=404, detail=f"Receipt {receipt_id} not found")
+    return result
+@router.post("/store_receipt_db")
+async def run_store_receipt_db(chatgpt_user: str = Form(None), receipt_id: str = Form(None),
+                            receipt_content: str = Form(None), sparrow_key: str = Form(None)):
+    """Store (insert or update) a user's receipt content in MongoDB.
+
+    Args:
+        chatgpt_user: User the receipt belongs to.
+        receipt_id: Key of the receipt for that user.
+        receipt_content: Receipt data; must be a JSON document in text form.
+        sparrow_key: Caller-supplied key; must equal settings.sparrow_key.
+
+    Returns:
+        An empty 200 response on success, or an error dict when the key does
+        not match.
+
+    Raises:
+        HTTPException: 400 when MONGODB_URL is not configured, when the content
+            is missing or not valid JSON, or when saving fails.
+    """
+    # NOTE(review): sparrow_key defaults to None; if settings.sparrow_key is
+    # also None (variable unset) this comparison lets the request through.
+    if sparrow_key != settings.sparrow_key:
+        return {"error": "Invalid Sparrow key."}
+    print(f"Storing receipt {receipt_id} for user {chatgpt_user}...")
+    if "MONGODB_URL" not in os.environ:
+        # Raised, not returned: a returned HTTPException becomes a 200 body.
+        raise HTTPException(status_code=400, detail="No MongoDB URL provided.")
+    try:
+        # Validation only; the original text is what gets stored.
+        json.loads(receipt_content)
+    except (TypeError, ValueError):
+        # TypeError covers a missing form field (None); JSONDecodeError is a
+        # ValueError subclass and covers malformed JSON.
+        raise HTTPException(status_code=400, detail="Receipt content is not valid JSON.")
+    try:
+        result = await store_receipt_db_data(chatgpt_user, receipt_id, receipt_content, db)
+    except PyMongoError:
+        raise HTTPException(status_code=400, detail="Saving data failed.")
+    if result is None:
+        # The helper returns None when the data failed model validation; report
+        # that as a save failure instead of the unrelated "No MongoDB URL" error.
+        raise HTTPException(status_code=400, detail="Saving data failed.")
+    return Response(status_code=200)
+@router.get("/receipt_db_by_id")
+async def get_receipt_db_by_id(chatgpt_user: str, receipt_id: str, sparrow_key: str):
+    """Return one stored receipt of a user as parsed JSON.
+
+    Args:
+        chatgpt_user: User the receipt belongs to.
+        receipt_id: Key of the receipt for that user.
+        sparrow_key: Caller-supplied key; must equal settings.sparrow_key.
+
+    Returns:
+        The stored content parsed with json.loads, or an error dict when the
+        key does not match.
+
+    Raises:
+        HTTPException: 400 when MONGODB_URL is not configured, 404 when the
+            receipt does not exist.
+    """
+    if sparrow_key != settings.sparrow_key:
+        return {"error": "Invalid Sparrow key."}
+    if "MONGODB_URL" not in os.environ:
+        # Raised, not returned: a returned HTTPException becomes a 200 body.
+        raise HTTPException(status_code=400, detail="No MongoDB URL provided.")
+    result = await get_receipt_db_data(chatgpt_user, receipt_id, db)
+    if result is None:
+        raise HTTPException(status_code=404, detail=f"Receipt {receipt_id} not found")
+    return json.loads(result)
+@router.delete("/receipt_db_by_id")
+async def delete_receipt_db_by_id(chatgpt_user: str, receipt_id: str, sparrow_key: str):
+    """Delete one stored receipt of a user.
+
+    Args:
+        chatgpt_user: User the receipt belongs to.
+        receipt_id: Key of the receipt for that user.
+        sparrow_key: Caller-supplied key; must equal settings.sparrow_key.
+
+    Returns:
+        An empty 200 response on success, or an error dict when the key does
+        not match.
+
+    Raises:
+        HTTPException: 400 when MONGODB_URL is not configured, 404 when nothing
+            was deleted.
+    """
+    if sparrow_key != settings.sparrow_key:
+        return {"error": "Invalid Sparrow key."}
+    if "MONGODB_URL" not in os.environ:
+        # Raised, not returned: a returned HTTPException becomes a 200 body.
+        raise HTTPException(status_code=400, detail="No MongoDB URL provided.")
+    result = await delete_receipt_db_data(chatgpt_user, receipt_id, db)
+    if result.deleted_count == 0:
+        raise HTTPException(status_code=404, detail=f"Receipt {receipt_id} not found")
+    return Response(status_code=200)
+@router.get("/receipt_db_ids_by_user")
+async def get_receipt_db_ids_by_user(chatgpt_user: str, sparrow_key: str):
+    """Return the receipt keys stored for a user.
+
+    Args:
+        chatgpt_user: User whose receipt keys are listed.
+        sparrow_key: Caller-supplied key; must equal settings.sparrow_key.
+
+    Returns:
+        The value returned by get_user_receipt_db_ids, or an error dict when
+        the key does not match.
+
+    Raises:
+        HTTPException: 400 when MONGODB_URL is not configured, 404 when the
+            helper returns None.
+    """
+    if sparrow_key != settings.sparrow_key:
+        return {"error": "Invalid Sparrow key."}
+    if "MONGODB_URL" not in os.environ:
+        # Raised, not returned: a returned HTTPException becomes a 200 body.
+        raise HTTPException(status_code=400, detail="No MongoDB URL provided.")
+    result = await get_user_receipt_db_ids(chatgpt_user, db)
+    if result is None:
+        raise HTTPException(status_code=404, detail=f"User {chatgpt_user} not found")
+    return result
+@router.get("/receipt_db_content_by_user")
+async def get_receipt_db_content_by_user(chatgpt_user: str, sparrow_key: str):
+    """Return the stored receipt contents of a user.
+
+    Args:
+        chatgpt_user: User whose receipts are returned.
+        sparrow_key: Caller-supplied key; must equal settings.sparrow_key.
+
+    Returns:
+        The value returned by get_user_receipt_content_db, or an error dict
+        when the key does not match.
+
+    Raises:
+        HTTPException: 400 when MONGODB_URL is not configured.
+    """
+    if sparrow_key != settings.sparrow_key:
+        return {"error": "Invalid Sparrow key."}
+    if "MONGODB_URL" not in os.environ:
+        # Raised, not returned: a returned HTTPException becomes a 200 body.
+        raise HTTPException(status_code=400, detail="No MongoDB URL provided.")
+    return await get_user_receipt_content_db(chatgpt_user, db)

routers/data_utils.py ADDED Viewed

	@@ -0,0 +1,227 @@

+import secrets
+import string
+from bson import ObjectId
+from pydantic import BaseModel, Field, ValidationError
+from typing import List
+import datetime
+from Crypto.Cipher import AES
+from Crypto.Util.Padding import pad, unpad
+from base64 import b64encode, b64decode
+import base64
+from pymongo.errors import DuplicateKeyError
+from pymongo.errors import PyMongoError
+import json
+from config import settings
+# Define a key. Note: it must be of length 16, 24, or 32.
+# The setting holds base64 text: the functions below pass it through
+# base64.b64decode before handing it to AES, so the length requirement applies
+# to the decoded bytes.
+secure_key = settings.secure_key
+def encrypt(plain_text: str, key: bytes) -> str:
+    """Encrypt ``plain_text`` with AES-CBC and return base64 text of IV + ciphertext."""
+    aes = AES.new(key, AES.MODE_CBC)
+    padded = pad(plain_text.encode(), AES.block_size)
+    ciphertext = aes.encrypt(padded)
+    # The cipher's IV is stored in front of the ciphertext so it can be recovered on decryption.
+    return b64encode(aes.iv + ciphertext).decode()
+def decrypt(encrypted_text: str, key: bytes) -> str:
+    """Decode base64 text holding a 16-byte IV followed by AES-CBC ciphertext and return the plain text."""
+    raw = b64decode(encrypted_text)
+    # The first 16 bytes are the IV; everything after is the padded ciphertext.
+    iv, ciphertext = raw[:16], raw[16:]
+    aes = AES.new(key, AES.MODE_CBC, iv=iv)
+    plain = unpad(aes.decrypt(ciphertext), AES.block_size)
+    return plain.decode()
+class PyObjectId(ObjectId):
+    """ObjectId subclass that provides the pydantic custom-type hooks."""
+    @classmethod
+    def __get_validators__(cls):
+        # A single validator: validate() below.
+        yield cls.validate
+    @classmethod
+    def validate(cls, v):
+        """Return ``v`` as an ObjectId, raising ValueError when it is not a valid id."""
+        if ObjectId.is_valid(v):
+            return ObjectId(v)
+        raise ValueError("Invalid objectid")
+    @classmethod
+    def __modify_schema__(cls, field_schema):
+        # Present the field as a plain string in the generated schema.
+        field_schema.update({"type": "string"})
+class ReceiptModel(BaseModel):
+    # Model of an OCR upload: a generated id, the lookup key and the OCR text lines.
+    # The id field is exposed under the "_id" alias and gets a fresh PyObjectId by default.
+    id: PyObjectId = Field(default_factory=PyObjectId, alias="_id")
+    receipt_key: str = Field(..., description="The unique key for the receipt.")
+    content: List[List[str]] = Field(..., description="An array of single-element arrays, each containing receipt entry.")
+    class Config:
+        # Accept both the alias ("_id") and the field name ("id") on input.
+        allow_population_by_field_name = True
+        # Needed because the id field uses an ObjectId-based type.
+        arbitrary_types_allowed = True
+        # ObjectId values are rendered with str() when the model is JSON-encoded.
+        json_encoders = {ObjectId: str}
+        # Example, title and description merged into the generated schema.
+        schema_extra = {
+            'example': {
+                'receipt_key': 'RzSZ0BTnuG',
+                'content': [['YOUR GUEST NUMBER IS'], ['43'], ['IN-N-OUT BURGER LINQ']]
+            },
+            'title': 'ReceiptModel',
+            'description': 'A model representing a receipt with a key and its contents.',
+        }
+class ReceiptDBModel(BaseModel):
+    # Model of a stored user receipt: owner, receipt key and the content as a single string.
+    # The id field is exposed under the "_id" alias and gets a fresh PyObjectId by default.
+    id: PyObjectId = Field(default_factory=PyObjectId, alias="_id")
+    user: str = Field(..., description="The user who uploaded the receipt.")
+    receipt_key: str = Field(..., description="The unique key for the receipt.")
+    content: str = Field(..., description="A string representing DB receipt data.")
+    class Config:
+        # Accept both the alias ("_id") and the field name ("id") on input.
+        allow_population_by_field_name = True
+        # Needed because the id field uses an ObjectId-based type.
+        arbitrary_types_allowed = True
+        # ObjectId values are rendered with str() when the model is JSON-encoded.
+        json_encoders = {ObjectId: str}
+        # NOTE(review): the schema title below ('ReceiptProcessedModel') differs
+        # from the class name; confirm that is intended.
+        schema_extra = {
+            'example': {
+                'user': 'user1',
+                'receipt_key': 'RzSZ0BTnuG',
+                'content': '{"store": "CVS Pharmacy", "location": "3300 S LAS VEGAS BLVD, LAS VEGAS, NV, 89109"}'
+            },
+            'title': 'ReceiptProcessedModel',
+            'description': 'A model representing a receipt DB contents.',
+        }
+def merge_data(values):
+    """Reduce OCR lines to a list of single-element lists holding each line's text.
+
+    Args:
+        values: Iterable of OCR lines. The text is read from ``line[1][0]``;
+            presumably each line is ``[box, (text, confidence)]`` - confirm
+            against the OCR library output.
+
+    Returns:
+        A list with one ``[text]`` entry per input line, in input order.
+    """
+    return [[line[1][0]] for line in values]
+async def store_data(data, db):
+    """Encrypt OCR content and insert it into the ``uploads`` collection.
+
+    Returns the generated receipt key, or None when ``data`` fails model
+    validation. Errors raised by the insert (including DuplicateKeyError)
+    propagate to the caller.
+    """
+    print("Storing data...")
+    key = generate_key()
+    try:
+        receipt = ReceiptModel(receipt_key=key, content=data)
+    except ValidationError as e:
+        print(f"An error occurred: {e}")
+        return None
+    # Work on the plain-dict form of the model from here on.
+    document = receipt.dict()
+    content_text = str(document["content"])
+    document["content"] = encrypt(content_text, base64.b64decode(secure_key))
+    document["created_at"] = datetime.datetime.utcnow()
+    result = await db["uploads"].insert_one(document)
+    print(f"Inserted document with id: {result.inserted_id}")
+    return key
+async def get_receipt_data(key, db):
+    """Fetch, remove and decrypt the upload stored under ``key``.
+
+    Args:
+        key: Value of the ``receipt_key`` field to look up.
+        db: Database handle exposing the ``uploads`` collection.
+
+    Returns:
+        The decrypted content string, or None when no upload has that key.
+    """
+    print(f"Getting receipt data for key: {key}")
+    # Read and delete in one atomic operation: a separate find followed by a
+    # delete would let two concurrent requests both read the same upload
+    # before either removed it.
+    receipt = await db["uploads"].find_one_and_delete({"receipt_key": key})
+    if receipt is None:
+        return None
+    return decrypt(receipt['content'], base64.b64decode(secure_key))
+async def store_receipt_db_data(chatgpt_user, receipt_id, receipt_content, db):
+    """Encrypt receipt content and upsert it into the ``receipts`` collection.
+
+    Returns the update result, or None when the input fails model validation.
+    Database errors propagate to the caller.
+    """
+    print("Storing receipt data...")
+    try:
+        receipt = ReceiptDBModel(user=chatgpt_user, receipt_key=receipt_id, content=receipt_content)
+    except ValidationError as e:
+        print(f"An error occurred: {e}")
+        return None
+    content_text = str(receipt.dict()["content"])
+    encrypted = encrypt(content_text, base64.b64decode(secure_key))
+    # Only the content field is written; user and receipt key identify the document.
+    selector = {"user": chatgpt_user, "receipt_key": receipt_id}
+    change = {"$set": {"content": encrypted}}
+    result = await db["receipts"].update_one(selector, change, upsert=True)
+    print(f"Inserted document with id: {result}")
+    return result
+async def get_receipt_db_data(chatgpt_user, receipt_id, db):
+    """Return the decrypted content of a user's stored receipt, or None when it does not exist."""
+    print(f"Getting receipt data for key: {receipt_id}")
+    selector = {"user": chatgpt_user, "receipt_key": receipt_id}
+    receipt = await db["receipts"].find_one(selector)
+    if receipt is None:
+        return None
+    return decrypt(receipt['content'], base64.b64decode(secure_key))
+async def get_user_receipt_db_ids(chatgpt_user, db):
+    """Return the receipt keys stored for ``chatgpt_user`` (the query fetches at most 100 documents)."""
+    print(f"Getting user receipts ids for user: {chatgpt_user}")
+    cursor = db["receipts"].find({"user": chatgpt_user})
+    documents = await cursor.to_list(length=100)
+    if documents is None:
+        return []
+    return [document['receipt_key'] for document in documents]
+async def delete_receipt_db_data(chatgpt_user, receipt_id, db):
+    """Delete a user's stored receipt and return the delete result; the outcome is logged."""
+    print(f"Deleting receipt data for key: {receipt_id}")
+    selector = {"user": chatgpt_user, "receipt_key": receipt_id}
+    result = await db["receipts"].delete_one(selector)
+    if result.deleted_count != 0:
+        print(f"Deleted document with id: {result}")
+    else:
+        print(f"Receipt with id: {receipt_id} not found")
+    return result
+async def get_user_receipt_content_db(chatgpt_user, db):
+    """Return the decrypted, JSON-parsed contents of a user's receipts (the query fetches at most 100 documents)."""
+    print(f"Getting user receipts fields for user: {chatgpt_user}")
+    cursor = db["receipts"].find({"user": chatgpt_user})
+    documents = await cursor.to_list(length=100)
+    if documents is None:
+        return []
+    return [
+        json.loads(decrypt(document['content'], base64.b64decode(secure_key)))
+        for document in documents
+    ]
+def generate_key(length=10):
+    """Return a random string of ``length`` ASCII letters and digits chosen with ``secrets``."""
+    charset = string.ascii_letters + string.digits
+    picks = [secrets.choice(charset) for _ in range(length)]
+    return ''.join(picks)

routers/dataset.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from fastapi import APIRouter
+from datasets import load_dataset
+from ast import literal_eval
+from pydantic import BaseModel
+from typing import Dict
+from io import BytesIO
+from PIL import Image
+import base64
+from config import settings
+from huggingface_hub import login
+# Router that collects the dataset endpoints defined in this module.
+router = APIRouter()
+# Runs when the module is imported: logs in to the Hugging Face Hub with the
+# configured key (which is None when the environment variable is unset).
+login(settings.huggingface_key)
+class ImageResponse(BaseModel):
+    # Response body of the /ground_truth endpoint.
+    # Base64 text of the JPEG-encoded image (see encode_pil_image).
+    image_data: str
+    # The 'gt_parse' value taken from the example's ground-truth record.
+    ground_truth_data: Dict
+def encode_pil_image(image: Image.Image) -> str:
+    """Encode a PIL image as JPEG and return the bytes as base64 text.
+
+    Args:
+        image: Image to encode. Modes the JPEG writer cannot store (for example
+            RGBA or palette images) are converted to RGB first.
+
+    Returns:
+        Base64 (UTF-8 text) of the JPEG bytes.
+    """
+    # Saving an image with an alpha channel or a palette as JPEG raises
+    # OSError; modes the writer already accepts are left untouched.
+    if image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
+        image = image.convert("RGB")
+    buffer = BytesIO()
+    image.save(buffer, format='JPEG')
+    return base64.b64encode(buffer.getvalue()).decode('utf-8')
+@router.get("/dataset_info")
+async def get_dataset_info():
+    """Return the configured dataset name together with the row count of each split."""
+    dataset = load_dataset(settings.dataset_name)
+    splits = [
+        {"name": split_name, "number_of_rows": len(dataset[split_name])}
+        for split_name in dataset.keys()
+    ]
+    return {"dataset": settings.dataset_name, "splits": splits}
+@router.get("/ground_truth", response_model=ImageResponse)
+async def get_ground_truth() -> ImageResponse:
+    """Return the first example of the 'test' split: its image as base64 JPEG plus its parsed ground truth."""
+    example = load_dataset(settings.dataset_name)['test'][0]
+    encoded_img = encode_pil_image(example['image'])
+    # The ground truth is stored as a Python-literal string; only its 'gt_parse' entry is returned.
+    parsed = literal_eval(example['ground_truth'])
+    return ImageResponse(image_data=encoded_img, ground_truth_data=parsed['gt_parse'])

routers/ocr.py ADDED Viewed

	@@ -0,0 +1,187 @@

+from fastapi import APIRouter, File, UploadFile, Form, HTTPException, status
+from fastapi.responses import JSONResponse
+from config import settings
+from PIL import Image
+import urllib.request
+from io import BytesIO
+import utils
+import os
+import time
+from functools import lru_cache
+from paddleocr import PaddleOCR
+from pdf2image import convert_from_bytes
+import io
+import json
+from routers.data_utils import merge_data
+from routers.data_utils import store_data
+import motor.motor_asyncio
+from typing import Optional
+from pymongo import ASCENDING
+from pymongo.errors import DuplicateKeyError
+# Router that collects the OCR endpoints defined in this module.
+router = APIRouter()
+# MongoDB client and database handle; both stay None until the startup hook
+# assigns them, which only happens when MONGODB_URL is set.
+client = None
+db = None
+async def create_unique_index(collection, *fields):
+    """Create a unique ascending index over ``fields`` on ``collection`` and return what create_index returns."""
+    keys = []
+    for field_name in fields:
+        # Direction 1 means ascending.
+        keys.append((field_name, 1))
+    created = await collection.create_index(keys, unique=True)
+    return created
+async def create_ttl_index(db, collection_name, field, expire_after_seconds):
+    """Create an ascending index on ``field`` with ``expireAfterSeconds`` set and log the result."""
+    ttl_keys = [(field, ASCENDING)]
+    index_result = await db[collection_name].create_index(
+        ttl_keys, expireAfterSeconds=expire_after_seconds
+    )
+    print(f"TTL index created or already exists: {index_result}")
+@router.on_event("startup")
+async def startup_event():
+    """Connect to MongoDB and create the indexes when MONGODB_URL is present in the environment."""
+    global client, db
+    if "MONGODB_URL" not in os.environ:
+        # No database configured: the module-level handles are left untouched.
+        return
+    client = motor.motor_asyncio.AsyncIOMotorClient(os.environ["MONGODB_URL"])
+    db = client.chatgpt_plugin
+    # Unique indexes: uploads by receipt key, receipts by (user, receipt key).
+    unique_indexes = (
+        ('uploads', ('receipt_key',)),
+        ('receipts', ('user', 'receipt_key')),
+    )
+    for collection_name, index_fields in unique_indexes:
+        index_result = await create_unique_index(db[collection_name], *index_fields)
+        print(f"Unique index created or already exists: {index_result}")
+    # TTL index on uploads.created_at with a 15 minute expiry.
+    await create_ttl_index(db, 'uploads', 'created_at', 15*60)
+    print("Connected to MongoDB from OCR!")
+@router.on_event("shutdown")
+async def shutdown_event():
+    """Close the MongoDB client if one was opened."""
+    # Test the client itself rather than the environment variable: the variable
+    # can be set while the client is still None (the startup hook did not run
+    # or failed), and calling close() on None would raise during shutdown.
+    if client is not None:
+        client.close()
+@lru_cache(maxsize=1)
+def load_ocr_model():
+    """Build the English PaddleOCR model with angle classification; lru_cache keeps the single instance for reuse."""
+    return PaddleOCR(use_angle_cls=True, lang='en')
+def invoke_ocr(doc, content_type):
+    """Run OCR on an image and return the recognised text with the elapsed time.
+
+    Args:
+        doc: PIL image to process.
+        content_type: MIME type of the original input; "image/png" keeps PNG
+            encoding, anything else is encoded as JPEG.
+
+    Returns:
+        A ``(values, processing_time)`` tuple: the output of merge_data for all
+        recognised lines, and the wall-clock seconds spent in this function.
+    """
+    worker_pid = os.getpid()
+    print(f"Handling OCR request with worker PID: {worker_pid}")
+    start_time = time.time()
+    model = load_ocr_model()
+    format_img = "PNG" if content_type == "image/png" else "JPEG"
+    # The declared content type may not match the decoded image: JPEG cannot
+    # store alpha or palette modes, so convert those instead of failing in save().
+    if format_img == "JPEG" and doc.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
+        doc = doc.convert("RGB")
+    # The context manager releases the buffer even when save() raises.
+    with io.BytesIO() as bytes_img:
+        doc.save(bytes_img, format=format_img)
+        bytes_data = bytes_img.getvalue()
+    result = model.ocr(bytes_data, cls=True)
+    # Flatten the per-page results; empty or None page entries are skipped.
+    values = [line for page in result if page for line in page]
+    values = merge_data(values)
+    end_time = time.time()
+    processing_time = end_time - start_time
+    print(f"OCR done, worker PID: {worker_pid}")
+    return values, processing_time
+def _open_document(raw_bytes, content_type, pdf_types):
+    """Decode raw bytes into a PIL image (first page for PDFs); return None for an unsupported type."""
+    if content_type in ("image/jpeg", "image/jpg", "image/png"):
+        return Image.open(BytesIO(raw_bytes))
+    if content_type in pdf_types:
+        pages = convert_from_bytes(raw_bytes, 300)
+        if not pages:
+            # A PDF without pages cannot be processed; avoid an IndexError.
+            raise HTTPException(status_code=400, detail="Failed to process the input.")
+        return pages[0]
+    return None
+async def _ocr_and_store(doc, content_type, file_name, post_processing):
+    """Run OCR, log the timing and, when requested and MongoDB is configured, store the result and return its key."""
+    result, processing_time = invoke_ocr(doc, content_type)
+    utils.log_stats(settings.ocr_stats_file, [processing_time, file_name])
+    print(f"Processing time OCR: {processing_time:.2f} seconds")
+    if post_processing and "MONGODB_URL" in os.environ:
+        print("Postprocessing...")
+        try:
+            result = await store_data(result, db)
+        except DuplicateKeyError:
+            # Raised, not returned: a returned HTTPException becomes a 200 body.
+            raise HTTPException(status_code=400, detail="Duplicate data.")
+        print(f"Stored data with key: {result}")
+    return result
+@router.post("/ocr")
+async def run_ocr(file: Optional[UploadFile] = File(None), image_url: Optional[str] = Form(None),
+                  post_processing: Optional[bool] = Form(False), sparrow_key: str = Form(None)):
+    """Run OCR on an uploaded file or on a document fetched from a URL.
+
+    Args:
+        file: Uploaded JPG/PNG image or PDF; takes precedence over image_url.
+        image_url: http(s) URL of an image or PDF to download.
+        post_processing: When true and MONGODB_URL is set, the OCR result is
+            stored and the generated key is returned instead of the text.
+        sparrow_key: Caller-supplied key; must equal settings.sparrow_key.
+
+    Returns:
+        A JSON response with the OCR lines (or the stored key), or an error or
+        info dict for a bad key, unsupported input or missing input.
+
+    Raises:
+        HTTPException: 400 when the URL cannot be fetched, the PDF has no
+            pages, the stored key collides, or no result was produced.
+    """
+    from urllib.error import URLError
+    from urllib.parse import urlsplit
+    if sparrow_key != settings.sparrow_key:
+        return {"error": "Invalid Sparrow key."}
+    if file:
+        content_type = file.content_type
+        file_name = file.filename
+        doc = _open_document(await file.read(), content_type, ("application/pdf",))
+    elif image_url:
+        # test image url: https://raw.githubusercontent.com/katanaml/sparrow/main/sparrow-data/docs/input/invoices/processed/images/invoice_10.jpg
+        # test PDF: https://raw.githubusercontent.com/katanaml/sparrow/main/sparrow-data/docs/input/receipts/2021/us/bestbuy-20211211_006.pdf
+        try:
+            scheme = urlsplit(image_url).scheme.lower()
+        except ValueError:
+            scheme = ""
+        # urlopen also understands file:// and other schemes; restrict the
+        # caller-supplied URL to plain web URLs.
+        # NOTE(review): hosts are not restricted, so internal addresses remain reachable.
+        if scheme not in ("http", "https"):
+            return {"error": "Invalid image URL. Only http and https URLs are allowed."}
+        try:
+            with urllib.request.urlopen(image_url) as response:
+                content_type = response.info().get_content_type()
+                raw_bytes = response.read()
+        except (URLError, ValueError):
+            # Unreachable host, HTTP error status or malformed URL.
+            raise HTTPException(status_code=400, detail="Failed to process the input.")
+        # PDFs are accepted both as application/pdf and as the generic
+        # octet-stream type some hosts use for them.
+        doc = _open_document(raw_bytes, content_type, ("application/octet-stream", "application/pdf"))
+        # parse file name from url
+        file_name = image_url.split("/")[-1]
+    else:
+        return JSONResponse(status_code=status.HTTP_200_OK, content={"info": "No input provided"})
+    if doc is None:
+        return {"error": "Invalid file type. Only JPG/PNG images and PDF are allowed."}
+    result = await _ocr_and_store(doc, content_type, file_name, post_processing)
+    if result is None:
+        raise HTTPException(status_code=400, detail="Failed to process the input.")
+    return JSONResponse(status_code=status.HTTP_200_OK, content=result)
+@router.get("/statistics")
+async def get_statistics():
+    """Return the parsed OCR statistics file, or an empty list when it is missing or not valid JSON."""
+    stats_path = settings.ocr_stats_file
+    if not os.path.exists(stats_path):
+        return []
+    with open(stats_path, 'r') as stats_file:
+        try:
+            return json.load(stats_file)
+        except json.JSONDecodeError:
+            # Unreadable content is treated like an empty log.
+            return []

utils.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import json
+import os
+from datetime import datetime
+def log_stats(file_path, new_data):
+    """Append a timestamped entry to the JSON statistics log at ``file_path``.
+
+    Args:
+        file_path: Path of the JSON file holding a list of entries. It is
+            created when missing; unreadable or non-list content is replaced.
+        new_data: Values of the new entry. The list is not modified; the
+            current local time ("%Y-%m-%d %H:%M:%S") is added to a copy.
+    """
+    content = []
+    # Check if the file exists, and read its content
+    if os.path.exists(file_path):
+        with open(file_path, 'r') as file:
+            try:
+                content = json.load(file)
+            except json.JSONDecodeError:
+                content = []
+    # Valid JSON that is not a list (e.g. an object) cannot be appended to.
+    if not isinstance(content, list):
+        content = []
+    # Format the current date and time as a string
+    date_time_string = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    # Build a new entry instead of appending to the caller's list.
+    content.append([*new_data, date_time_string])
+    # Write the updated content back to the file
+    with open(file_path, 'w') as file:
+        json.dump(content, file)