MinerU

Paused

App Files Community

princhman committed on Feb 18

Commit

0aedcf3

1 Parent(s): 78a8154

final update of the logic

Browse files

Files changed (7) hide show

__pycache__/inference_svm_model.cpython-310.pyc +0 -0
__pycache__/mineru_single.cpython-310.pyc +0 -0
__pycache__/worker.cpython-310.pyc +0 -0
app.py +0 -30
inference_svm_model.py +20 -18
mineru_single.py +8 -33
worker.py +28 -7

__pycache__/inference_svm_model.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ

__pycache__/mineru_single.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ

__pycache__/worker.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ

app.py CHANGED Viewed

@@ -24,36 +24,6 @@ app.add_middleware(
 async def root():
     return {"status": "ok", "message": "API is running"}
-@app.post("/process")
-async def process_pdf(
-    input_json: dict = Body(...),
-    x_api_key: str = Header(None, alias="X-API-Key")
-):
-    if not x_api_key:
-        raise HTTPException(status_code=401, detail="API key is missing")
-    if x_api_key != API_KEY:
-        raise HTTPException(status_code=401, detail="Invalid API key")
-    # Connect to RabbitMQ
-    rabbit_url = os.getenv("RABBITMQ_URL")
-    connection = pika.BlockingConnection(pika.URLParameters(rabbit_url))
-    channel = connection.channel()
-    channel.queue_declare(queue="ml_server", durable=True)
-    channel.basic_publish(
-        exchange="",
-        routing_key="gpu_server",
-        body=json.dumps(input_json),
-        properties=pika.BasicProperties(
-            headers={"process": "topic_extraction"}
-        )
-    )
-    connection.close()
-    return {
-        "message": "Job queued",
-        "request_id": input_json.get("headers", {}).get("request_id", str(uuid.uuid4()))
-    }
 if __name__ == "__main__":
     os.system('python download_models_hf.py')

 async def root():
     return {"status": "ok", "message": "API is running"}
 if __name__ == "__main__":
     os.system('python download_models_hf.py')

inference_svm_model.py CHANGED Viewed

@@ -1,29 +1,31 @@
 #!/usr/bin/env python3
 import cv2
 import numpy as np
 from joblib import load
-def load_svm_model(model_path: str):
-    return load(model_path)
-def classify_image(
-    image_path: str,
-    loaded_model,
-    label_map: dict,
-    image_size=(128, 128)
-) -> str:
-    img = cv2.imread(image_path)
-    if img is None:
-        # If image fails to load, default to "irrelevant" or handle differently
-        return label_map[0]
-    img = cv2.resize(img, image_size)
-    x = img.flatten().reshape(1, -1)
-    pred = loaded_model.predict(x)[0]
-    return label_map[pred]
 if __name__ == "__main__":
     model = load_svm_model("/home/user/app/model_classification/svm_model.joblib")
-    label_map = {0: "irrelevant", 1: "relevant"}
-    result = classify_image("test.jpg", model, label_map)
     print("Classification result:", result)

 #!/usr/bin/env python3
 import cv2
 import numpy as np
+import os
 from joblib import load
+class SVMModel:
+    def __init__(self):
+        path = os.getenv("SVM_MODEL_PATH", "/home/user/app/model_classification/svm_model.joblib")
+        self.model = load(path)
+    def classify_image(
+        self,
+        image_bytes: bytes,
+        image_size=(128, 128)
+    ) -> int:
+        img = cv2.imdecode(np.frombuffer(image_bytes, np.uint8), cv2.IMREAD_COLOR)
+        if img is None:
+            # If image fails to load, default to "irrelevant" or handle differently
+            return 0
+        img = cv2.resize(img, image_size)
+        x = img.flatten().reshape(1, -1)
+        pred = self.model.predict(x)[0]
+        return pred
 if __name__ == "__main__":
     model = load_svm_model("/home/user/app/model_classification/svm_model.joblib")
+    result = classify_image("test.jpg", model)
     print("Classification result:", result)

mineru_single.py CHANGED Viewed

@@ -10,7 +10,7 @@ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.data.io.s3 import S3Writer
 from magic_pdf.data.data_reader_writer.base import DataWriter
-from inference_svm_model import load_svm_model, classify_image
 class Processor:
     def __init__(self):
@@ -21,9 +21,7 @@ class Processor:
             endpoint_url=os.getenv("S3_ENDPOINT"),
         )
-        model_path = os.getenv("SVM_MODEL_PATH", "/home/user/app/model_classification/svm_model.joblib")
-        self.svm_model = load_svm_model(model_path)
-        self.label_map = {0: "irrelevant", 1: "relevant"}
         with open("/home/user/magic-pdf.json", "r") as f:
             config = json.load(f)
@@ -37,7 +35,7 @@ class Processor:
         bucket = os.getenv("S3_BUCKET_NAME", "")
         self.prefix = f"{endpoint}/{bucket}/document-extracts/"
-    def process(self, file_url: str) -> str:
         logger.info("Processing file: {}", file_url)
         response = requests.get(file_url)
         if response.status_code != 200:
@@ -54,53 +52,30 @@ class Processor:
             table_enable=self.table_enable
         )
-        image_writer = ImageWriter(self.s3_writer, self.svm_model, self.label_map)
         pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
-        folder_name = str(uuid.uuid4())
-        md_content = pipe_result.get_markdown(self.prefix + folder_name + "/")
         # Remove references to images classified as "irrelevant"
         final_markdown = image_writer.remove_redundant_images(md_content)
         return final_markdown
-    def process_batch(self, file_urls: list[str]) -> dict:
-        results = {}
-        for url in file_urls:
-            try:
-                md = self.process(url)
-                results[url] = md
-            except Exception as e:
-                results[url] = f"Error: {str(e)}"
-        return results
 class ImageWriter(DataWriter):
     """
     Receives each extracted image. Classifies it, uploads if relevant, or flags
     it for removal if irrelevant.
     """
-    def __init__(self, s3_writer: S3Writer, svm_model, label_map):
         self.s3_writer = s3_writer
         self.svm_model = svm_model
-        self.label_map = label_map
         self._redundant_images_paths = []
     def write(self, path: str, data: bytes) -> None:
-        import tempfile
-        import os
-        import uuid
-        tmp_name = f"{uuid.uuid4()}.jpg"
-        tmp_path = os.path.join(tempfile.gettempdir(), tmp_name)
-        with open(tmp_path, "wb") as f:
-            f.write(data)
-        label_str = classify_image(tmp_path, self.svm_model, self.label_map)
-        os.remove(tmp_path)
-        if label_str == "relevant":
             # Upload to S3
             self.s3_writer.write(path, data)
         else:

 from magic_pdf.data.io.s3 import S3Writer
 from magic_pdf.data.data_reader_writer.base import DataWriter
+from inference_svm_model import SVMModel
 class Processor:
     def __init__(self):
             endpoint_url=os.getenv("S3_ENDPOINT"),
         )
+        self.svm_model = SVMModel()
         with open("/home/user/magic-pdf.json", "r") as f:
             config = json.load(f)
         bucket = os.getenv("S3_BUCKET_NAME", "")
         self.prefix = f"{endpoint}/{bucket}/document-extracts/"
+    def process(self, file_url: str, key: str) -> str:
         logger.info("Processing file: {}", file_url)
         response = requests.get(file_url)
         if response.status_code != 200:
             table_enable=self.table_enable
         )
+        image_writer = ImageWriter(self.s3_writer, self.svm_model)
         pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
+        md_content = pipe_result.get_markdown(self.prefix + key + "/")
         # Remove references to images classified as "irrelevant"
         final_markdown = image_writer.remove_redundant_images(md_content)
         return final_markdown
 class ImageWriter(DataWriter):
     """
     Receives each extracted image. Classifies it, uploads if relevant, or flags
     it for removal if irrelevant.
     """
+    def __init__(self, s3_writer: S3Writer, svm_model: SVMModel):
         self.s3_writer = s3_writer
         self.svm_model = svm_model
         self._redundant_images_paths = []
     def write(self, path: str, data: bytes) -> None:
+        label_str = self.svm_model.classify_image(data)
+        if label_str == 1:
             # Upload to S3
             self.s3_writer.write(path, data)
         else:

worker.py CHANGED Viewed

@@ -14,13 +14,17 @@ from mineru_single import Processor
 class RabbitMQWorker:
     def __init__(self, num_workers: int = 1):
         self.num_workers = num_workers
-        self.rabbit_url = os.getenv("RABBITMQ_URL", "amqp://guest:guest@localhost:5672/")
         self.processor = Processor()
     def publish_message(self, body_dict: dict, headers: dict):
         """Create a new connection for each publish operation"""
         try:
-            connection = pika.BlockingConnection(pika.URLParameters(self.rabbit_url))
             channel = connection.channel()
             channel.queue_declare(queue="ml_server", durable=True)
@@ -56,41 +60,58 @@ class RabbitMQWorker:
                 # Process files
                 for file in body_dict.get("input_files", []):
                     try:
-                        context = {"key": file["key"], "body": self.processor.process(file["url"])}
                         contexts.append(context)
                     except Exception as e:
                         print(f"Error processing file {file['key']}: {e}")
                         contexts.append({"key": file["key"], "body": f"Error: {str(e)}"})
                 body_dict["md_context"] = contexts
                 # Publish results
                 if self.publish_message(body_dict, headers):
                     print(f"[Worker {thread_id}] Successfully published results")
                 else:
                     print(f"[Worker {thread_id}] Failed to publish results")
                 print(f"[Worker {thread_id}] Contexts: {contexts}")
             else:
                 print(f"[Worker {thread_id}] Unknown process")
         except Exception as e:
             print(f"Error in callback: {e}")
     def connect_to_rabbitmq(self):
-        """Establish connection to RabbitMQ"""
-        connection = pika.BlockingConnection(pika.URLParameters(self.rabbit_url))
         channel = connection.channel()
         channel.queue_declare(queue="gpu_server", durable=True)
         channel.basic_qos(prefetch_count=1)
         channel.basic_consume(
             queue="gpu_server",
-            on_message_callback=self.callback,
-            auto_ack=True
         )
         return connection, channel
     def start(self):
         """Start the worker threads"""
         print(f"Starting {self.num_workers} workers")

 class RabbitMQWorker:
     def __init__(self, num_workers: int = 1):
         self.num_workers = num_workers
+        self.rabbit_url = os.getenv("RABBITMQ_URL")
         self.processor = Processor()
     def publish_message(self, body_dict: dict, headers: dict):
         """Create a new connection for each publish operation"""
         try:
+            connection_params = pika.URLParameters(self.rabbit_url)
+            connection_params.heartbeat = 10
+            connection_params.blocked_connection_timeout = 5
+            connection = pika.BlockingConnection(connection_params)
             channel = connection.channel()
             channel.queue_declare(queue="ml_server", durable=True)
                 # Process files
                 for file in body_dict.get("input_files", []):
                     try:
+                        context = {"key": file["key"], "body": self.processor.process(file["url"], file["key"])}
                         contexts.append(context)
                     except Exception as e:
                         print(f"Error processing file {file['key']}: {e}")
                         contexts.append({"key": file["key"], "body": f"Error: {str(e)}"})
                 body_dict["md_context"] = contexts
+                ch.basic_ack(delivery_tag=method.delivery_tag)
                 # Publish results
                 if self.publish_message(body_dict, headers):
                     print(f"[Worker {thread_id}] Successfully published results")
                 else:
+                    ch.basic_nack(delivery_tag=method.delivery_tag, requeue=True)
                     print(f"[Worker {thread_id}] Failed to publish results")
                 print(f"[Worker {thread_id}] Contexts: {contexts}")
             else:
+                ch.basic_nack(delivery_tag=method.delivery_tag, requeue=True)
                 print(f"[Worker {thread_id}] Unknown process")
         except Exception as e:
             print(f"Error in callback: {e}")
+            ch.basic_nack(delivery_tag=method.delivery_tag, requeue=True)
     def connect_to_rabbitmq(self):
+        """Establish connection to RabbitMQ with heartbeat"""
+        connection_params = pika.URLParameters(self.rabbit_url)
+        connection_params.heartbeat = 30
+        connection_params.blocked_connection_timeout = 10
+        connection = pika.BlockingConnection(connection_params)
         channel = connection.channel()
         channel.queue_declare(queue="gpu_server", durable=True)
         channel.basic_qos(prefetch_count=1)
         channel.basic_consume(
             queue="gpu_server",
+            on_message_callback=self.callback
         )
         return connection, channel
+    def worker(self, channel):
+        """Worker function"""
+        print(f"Worker started")
+        try:
+            channel.start_consuming()
+        except Exception as e:
+            print(f"Worker stopped: {e}")
+        finally:
+            channel.close()
     def start(self):
         """Start the worker threads"""
         print(f"Starting {self.num_workers} workers")