fix long gemini calls (async)
topic_extraction.py  +223 -178
CHANGED
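The motivation (per the commit title) is long-running Gemini calls: `client.models.generate_content` blocks, and the new `classify_image_async` below hands it to asyncio's default thread-pool executor via `run_in_executor` so the event loop is not held up while a request is in flight. A minimal sketch of that pattern, with illustrative names rather than the committed code:

import asyncio
import time

def blocking_classify(image_bytes: bytes) -> str:
    # Stand-in for the blocking SDK call (client.models.generate_content in this file).
    time.sleep(1.0)
    return "NO_TABLE"

async def classify(image_bytes: bytes) -> str:
    loop = asyncio.get_event_loop()
    # Offload the blocking call to the default thread pool; the event loop stays free.
    return await loop.run_in_executor(None, blocking_classify, image_bytes)

if __name__ == "__main__":
    print(asyncio.run(classify(b"fake-image-bytes")))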
@@ -6,7 +6,8 @@ import json
import logging
import fitz
import base64
-import
from io import BytesIO
from typing import List, Dict, Any

@@ -19,19 +20,123 @@ from google.genai import types

from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-
from table_row_extraction import TableExtractor

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

_GEMINI_CLIENT = None
-

def unify_whitespace(text: str) -> str:
    return re.sub(r"\s+", " ", text).strip().lower()

def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
    if not page_indices:
        raise ValueError("No page indices provided for subset creation.")

@@ -48,41 +153,23 @@ def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
    doc.close()
    return subset_bytes

-def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
-    """
-    Return a sorted list of 0-based pages in which `search_text` (normalized) appears,
-    scanning the entire PDF in RAW mode.
-    """
-    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-    st_norm = unify_whitespace(search_text)
-    found = []
-    for i in range(doc.page_count):
-        raw = doc[i].get_text("raw")
-        norm = unify_whitespace(raw)
-        if st_norm in norm:
-            found.append(i)
-    doc.close()
-    return sorted(found)
-
class GeminiTopicExtractor:
    def __init__(self, api_key: str = None, num_pages: int = 10):
        self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
        self.num_pages = num_pages

    def extract_subtopics(self, pdf_path: str) -> Dict[str, List[int]]:
-        """
-        Return a dict of subtopics => [start_page, end_page].
-        """
        first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
        if not first_pages_text.strip():
            logger.error("No text from first pages => cannot extract subtopics.")
            return {}
        prompt = f"""
You have the first pages of a PDF specification, including a table of contents.

Instructions:
1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
-2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X").
3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
4. Output only valid JSON of the form:
{{

@@ -92,7 +179,8 @@ Instructions:
5. If you can't find any subtopics, return an empty JSON.

Important notes:
--

Examples:

@@ -166,6 +254,54 @@ The correct output should be:
"Theme 4: A global perspective": [29, 38]
}}

Now, extract topics from this text:
{first_pages_text}
"""

@@ -185,25 +321,15 @@ Now, extract topics from this text:

        raw_json = response.text.strip()
        cleaned = raw_json.replace("```json", "").replace("```", "")
-
-
-
-
-
-        # if flat, example {"Paper 1...": [11,29]}
-        # so we unify it to a single dict of subname => [start,end].
        final_dict = {}
-
-        # If the top-level is a dict of dict
-        # We look for a dict whose values are themselves subtopics
-        # Or it might be a direct subtopic dict
-        # We'll try a quick approach:
-        # - If any top-level value is a dict with numeric arrays, use that
-        # - else assume data is the direct subtopic dict
        found_sub_dict = None
        for k, v in data.items():
            if isinstance(v, dict):
-                # might be the sub-sub dict
                found_sub_dict = v
                break
        if found_sub_dict is not None:

@@ -211,8 +337,6 @@ Now, extract topics from this text:
            if isinstance(rng, list) and len(rng) == 2:
                final_dict[subk] = rng
        else:
-            # maybe data is the direct subtopic dict
-            # parse data
            for subk, rng in data.items():
                if isinstance(rng, list) and len(rng) == 2:
                    final_dict[subk] = rng

@@ -234,124 +358,36 @@ Now, extract topics from this text:
            logger.error(f"Could not open PDF: {e}")
        return "\n".join(text_parts)

-
-    """
-    Classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE
-    """
-    #shrink image to reduce size
-    try:
-        arr = np.frombuffer(image_data, np.uint8)
-        img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
-        if img is not None:
-            h, w, _ = img.shape
-            max_dim = 800
-            scale = 1.0
-            if max(h, w) > max_dim:
-                scale = max_dim / float(max(h, w))
-            if scale < 1.0:
-                new_w = int(w * scale)
-                new_h = int(h * scale)
-                img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
-            encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), 70]
-            success, enc = cv2.imencode(".jpg", img, encode_params)
-            if success:
-                image_data = enc.tobytes()
-    except Exception as e:
-        logger.warning(f"shrink_image_to_jpeg error: {e}")
-
-    prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
-The three-column 'table' image include such key features:
-- Three columns header columns
-- Headers like 'Topics', 'Content', 'Guidelines'
-- Numbered sections (e.g., 8.4, 9.1)
-- Educational curriculum-style structure
-The two-column 'table' image include such key features:
-- Two columns header columns
-- Headers like 'Subject content' and 'Additional information'
-- Numbered sections (e.g., 2.1, 3.4)
-- Educational curriculum-style structure
-- Bullet description in 'Additional information'
-If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
-If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
-If the image does not show a table at all, respond with 'NO_TABLE'.
-Return only one of these exact labels as your entire response:
-TWO_COLUMN
-THREE_COLUMN
-NO_TABLE
-"""
-    global _GEMINI_CLIENT
-    if _GEMINI_CLIENT is None:
-        _GEMINI_CLIENT = genai.Client(api_key=api_key)
-    client = _GEMINI_CLIENT
-    try:
-        resp = client.models.generate_content(
-            model="gemini-2.0-flash",
-            contents=[
-                {
-                    "parts": [
-                        {"text": prompt},
-                        {
-                            "inline_data": {
-                                "mime_type": "image/jpeg",
-                                "data": base64.b64encode(image_data).decode('utf-8')
-                            }
-                        }
-                    ]
-                }
-            ],
-            config=types.GenerateContentConfig(temperature=0.0)
-        )
-        if resp and resp.text:
-            classification = resp.text.strip().upper()
-            if "THREE" in classification:
-                return "THREE_COLUMN"
-            elif "TWO" in classification:
-                return "TWO_COLUMN"
-        return "NO_TABLE"
-    except Exception as e:
-        logger.error(f"Gemini table classification error: {e}")
-        return "NO_TABLE"
-
-class LocalImageWriter:
-    """
-    Writes extracted images, then does concurrency-based table classification calls.
-    """
    def __init__(self, output_folder: str, gemini_api_key: str):
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)
-        self.images_dir = os.path.join(self.output_folder, "images")
-        os.makedirs(self.images_dir, exist_ok=True)
        self.descriptions = {}
        self._img_count = 0
        self.gemini_api_key = gemini_api_key

    def write(self, path: str, data: bytes) -> None:
        self._img_count += 1
-
-        fpath = os.path.join(self.images_dir, fname)
-        with open(fpath, "wb") as f:
-            f.write(data)
-        rel_path = os.path.relpath(fpath, self.output_folder)
        self.descriptions[path] = {
            "data": data,
-            "relative_path":
            "table_classification": "NO_TABLE",
            "final_alt": ""
        }

-    def
-        logger.info("Classifying images to detect tables (
-
-
-
-
-
-
-
-
-
-
        for p, info in self.descriptions.items():
            cls = info['table_classification']
            if cls == "TWO_COLUMN":

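Note on the removed block above: it was also where `_GEMINI_CLIENT` was lazily created with `genai.Client(api_key=api_key)`. The new `call_gemini_for_table_classification` further down in this diff reads the global (`client = _GEMINI_CLIENT`) without that guard, so unless the client is initialized somewhere outside the changed lines, `client` stays `None`, the call raises, and the handler quietly returns "NO_TABLE". Keeping the lazy init near the call would look roughly like this (a sketch; `_get_gemini_client` is not a function in this file):

def _get_gemini_client(api_key: str):
    # Lazily create one shared client, as the removed synchronous path did.
    global _GEMINI_CLIENT
    if _GEMINI_CLIENT is None:
        _GEMINI_CLIENT = genai.Client(api_key=api_key)
    return _GEMINI_CLIENT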
@@ -360,32 +396,43 @@ class LocalImageWriter:
                info['final_alt'] = "HAS TO BE PROCESSED - three column table"
            else:
                info['final_alt'] = "NO_TABLE image"
-
-        #replace placeholders in the Markdown
        for p, info in self.descriptions.items():
            old_md = f""
            new_md = f"![{info['final_alt']}]({info['relative_path']})"
            md_content = md_content.replace(old_md, new_md)

-        # IF any table images => extract rows
        md_content = self._process_table_images_in_markdown(md_content)
-
-        # Keep only lines that are image references
        final_lines = []
        for line in md_content.split("\n"):
            if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
                final_lines.append(line.strip())
        return "\n".join(final_lines)

    def _process_table_images_in_markdown(self, md_content: str) -> str:
        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
        if not matches:
            return md_content
-
-
-
-
            try:
                if col_type.lower() == 'two':
                    extractor = TableExtractor(

@@ -401,25 +448,31 @@ class LocalImageWriter:
                        enable_subtopic_merge=False,
                        subtopic_threshold=0.2
                    )
-                row_boxes = extractor.process_image(
-
                os.makedirs(out_folder, exist_ok=True)
-

                snippet = ["**Extracted table cells:**"]
                for i, row in enumerate(row_boxes):
                    row_dir = os.path.join(out_folder, f"row_{i}")
                    for j, _ in enumerate(row):
-                        cell_file = f"col_{j}.
                        cell_path = os.path.join(row_dir, cell_file)
                        relp = os.path.relpath(cell_path, self.output_folder)
                        snippet.append(f"")
-
                new_snip = "\n".join(snippet)
-
                md_content = md_content.replace(old_line, new_snip)
            except Exception as e:
-                logger.error(f"Error processing table image {
        return md_content

class MineruNoTextProcessor:
@@ -442,14 +495,14 @@ class MineruNoTextProcessor:
            logger.error(f"Error during GPU cleanup: {e}")

    def process(self, pdf_path: str) -> str:
        try:
-            #Extract subtopics from Gemini
            subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
-            logger.info(f"Gemini returned subtopics: {subtopics}")

-
            with open(pdf_path, "rb") as f:
                pdf_bytes = f.read()
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            total_pages = doc.page_count
            doc.close()

@@ -459,17 +512,14 @@
                logger.warning("No subtopics found. We'll process the entire PDF as fallback.")
                final_pages = set(range(total_pages))
            else:
-                # For each subtopic, find occurrence >= (start_p-1)
                for subname, rng in subtopics.items():
                    if not (isinstance(rng, list) and len(rng) == 2):
                        logger.warning(f"Skipping subtopic '{subname}' => invalid range {rng}")
                        continue
                    start_p, end_p = rng
                    if start_p > end_p:
-                        logger.warning(f"Skipping subtopic '{subname}' => start> end {rng}")
                        continue
-
-                    # find occurrences
                    occs = find_all_occurrences(pdf_bytes, subname)
                    logger.info(f"Occurrences of subtopic '{subname}': {occs}")
                    doc_start_0 = start_p - 1

@@ -479,7 +529,6 @@
                            chosen_page = p
                            break
                    if chosen_page is None:
-                        # fallback to last or 0
                        if occs:
                            chosen_page = occs[-1]
                            logger.warning(f"No occurrence >= {doc_start_0} for '{subname}'. Using last => {chosen_page}")

@@ -496,15 +545,13 @@
                    e0 = max(0, min(total_pages - 1, e0))
                    for pp in range(s0, e0 + 1):
                        final_pages.add(pp)
-
            if not final_pages:
                logger.warning("No valid pages after offset. We'll process entire PDF.")
                final_pages = set(range(total_pages))
-
            logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
            subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
-
-            # doc_analyze => concurrency => final MD
            dataset = PymuDocDataset(subset_pdf_bytes)
            inference = doc_analyze(
                dataset,

@@ -515,18 +562,18 @@
                table_enable=self.table_enable
            )
            logger.info("doc_analyze complete. Extracting images.")
-
            writer = LocalImageWriter(self.output_folder, self.gemini_api_key)
            pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
            md_content = pipe_result.get_markdown("local-unique-prefix/")
-
            final_markdown = writer.post_process("local-unique-prefix/", md_content)
-
            out_path = os.path.join(self.output_folder, "final_output.md")
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(final_markdown)
-
            return final_markdown
        finally:
            self.cleanup_gpu()

@@ -537,7 +584,5 @@ if __name__ == "__main__":
    try:
        processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
        md_output = processor.process(input_pdf)
-        print("Final Markdown Output:")
-        print(md_output)
    except Exception as e:
        logger.error(f"Processing failed: {e}")

import logging
import fitz
import base64
+import time
+import asyncio
from io import BytesIO
from typing import List, Dict, Any

from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+from magic_pdf.data.data_reader_writer.base import DataWriter
from table_row_extraction import TableExtractor

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
+file_handler = logging.FileHandler("topic_extraction.log")
+file_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s - %(message)s"))
+logger.addHandler(file_handler)

_GEMINI_CLIENT = None
+
+def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
+    """
+    Downscale the image to reduce payload size.
+    """
+    arr = np.frombuffer(image_data, np.uint8)
+    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
+    if img is not None:
+        h, w, _ = img.shape
+        if max(h, w) > max_dim:
+            scale = max_dim / float(max(h, w))
+            new_w = int(w * scale)
+            new_h = int(h * scale)
+            img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
+        encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
+        success, enc = cv2.imencode(".jpg", img, encode_params)
+        if success:
+            return enc.tobytes()
+    return image_data
+
+def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
+    """
+    Synchronously call the Gemini API to classify a table image.
+    """
+    for attempt in range(max_retries + 1):
+        try:
+            prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
+The three-column 'table' image include such key features:
+- Three columns header columns
+- Headers like 'Topics', 'Content', 'Guidelines'
+- Numbered sections (e.g., 8.4, 9.1)
+- Educational curriculum-style structure
+The two-column 'table' image include such key features:
+- Two columns header columns
+- Headers like 'Subject content' and 'Additional information'
+- Numbered sections (e.g., 2.1, 3.4)
+- Educational curriculum-style structure
+- Bullet description in 'Additional information'
+If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
+If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
+If the image does not show a table at all, respond with 'NO_TABLE'.
+Return only one of these exact labels as your entire response:
+TWO_COLUMN
+THREE_COLUMN
+NO_TABLE
+"""
+            global _GEMINI_CLIENT
+            client = _GEMINI_CLIENT
+            resp = client.models.generate_content(
+                model="gemini-2.0-flash",
+                contents=[
+                    {
+                        "parts": [
+                            {"text": prompt},
+                            {
+                                "inline_data": {
+                                    "mime_type": "image/jpeg",
+                                    "data": base64.b64encode(image_data).decode('utf-8')
+                                }
+                            }
+                        ]
+                    }
+                ],
+                config=types.GenerateContentConfig(temperature=0.0)
+            )
+            if resp and resp.text:
+                classification = resp.text.strip().upper()
+                if "THREE" in classification:
+                    return "THREE_COLUMN"
+                elif "TWO" in classification:
+                    return "TWO_COLUMN"
+            return "NO_TABLE"
+        except Exception as e:
+            error_msg = str(e)
+            logger.error(f"Gemini table classification error: {error_msg}")
+            if "503" in error_msg:
+                return "NO_TABLE"
+            if attempt < max_retries:
+                logger.warning("Retrying classification due to error... attempt %d", attempt + 1)
+                time.sleep(0.5)
+            else:
+                return "NO_TABLE"
+
+async def classify_image_async(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
+    """
+    Asynchronous wrapper for image classification.
+    """
+    loop = asyncio.get_event_loop()
+    preprocessed = preprocess_image(image_data)
+    return await loop.run_in_executor(None, call_gemini_for_table_classification, preprocessed, api_key, max_retries)

def unify_whitespace(text: str) -> str:
    return re.sub(r"\s+", " ", text).strip().lower()

+def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
+    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+    st_norm = unify_whitespace(search_text)
+    found = []
+    for i in range(doc.page_count):
+        raw = doc[i].get_text("raw")
+        norm = unify_whitespace(raw)
+        if st_norm in norm:
+            found.append(i)
+    doc.close()
+    return sorted(found)
+
def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
    if not page_indices:
        raise ValueError("No page indices provided for subset creation.")

    doc.close()
    return subset_bytes

class GeminiTopicExtractor:
    def __init__(self, api_key: str = None, num_pages: int = 10):
        self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
        self.num_pages = num_pages

    def extract_subtopics(self, pdf_path: str) -> Dict[str, List[int]]:
        first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
        if not first_pages_text.strip():
            logger.error("No text from first pages => cannot extract subtopics.")
            return {}
+
        prompt = f"""
You have the first pages of a PDF specification, including a table of contents.

Instructions:
1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
+2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
4. Output only valid JSON of the form:
{{

5. If you can't find any subtopics, return an empty JSON.

Important notes:
+- The correct "end_page" must be the page number of the next topic or subtopic minus 1.
+- The final output must be valid JSON only, with no extra text or code blocks.

Examples:

"Theme 4: A global perspective": [29, 38]
}}

+3. You might also see sections like:
+
+2.1 AS Unit 1 11
+2.2 AS Unit 2 18
+2.3 A2 Unit 3 24
+2.4 A2 Unit 4 31
+
+In that scenario, your output might look like:
+
+{{
+"AS Unit 1": [11, 17],
+"AS Unit 2": [18, 23],
+"A2 Unit 3": [24, 30],
+"A2 Unit 4": [31, 35]
+}}
+
+4. Another example might list subtopics:
+
+3.1 Overarching themes 11
+3.2 A: Proof 12
+3.3 B: Algebra and functions 13
+3.4 C: Coordinate geometry in the ( x , y ) plane 14
+3.5 D: Sequences and series 15
+3.6 E: Trigonometry 16
+3.7 F: Exponentials and logarithms 17
+3.8 G: Differentiation 18
+3.9 H: Integration 19
+3.10 I: Numerical methods 20
+3.11 J: Vectors 20
+3.12 K: Statistical sampling 21
+3.13 L: Data presentation and interpretation 21
+3.14 M: Probability 22
+3.15 N: Statistical distributions 23
+3.16 O: Statistical hypothesis testing 23
+3.17 P: Quantities and units in mechanics 24
+3.18 Q: Kinematics 24
+3.19 R: Forces and Newton’s laws 24
+3.20 S: Moments 25
+3.21 Use of data in statistics 26
+
+Here the correct output might look like:
+
+{{
+"A: Proof": [12, 12],
+"B: Algebra and functions": [13, 13],
+...
+}}
+
Now, extract topics from this text:
{first_pages_text}
"""

        raw_json = response.text.strip()
        cleaned = raw_json.replace("```json", "").replace("```", "")
+        try:
+            data = json.loads(cleaned)
+        except Exception as json_err:
+            logger.error(f"JSON parsing error: {json_err}")
+            return {}
        final_dict = {}
        found_sub_dict = None
        for k, v in data.items():
            if isinstance(v, dict):
                found_sub_dict = v
                break
        if found_sub_dict is not None:

            if isinstance(rng, list) and len(rng) == 2:
                final_dict[subk] = rng
        else:
            for subk, rng in data.items():
                if isinstance(rng, list) and len(rng) == 2:
                    final_dict[subk] = rng

            logger.error(f"Could not open PDF: {e}")
        return "\n".join(text_parts)

+class LocalImageWriter(DataWriter):
    def __init__(self, output_folder: str, gemini_api_key: str):
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)
        self.descriptions = {}
        self._img_count = 0
        self.gemini_api_key = gemini_api_key

    def write(self, path: str, data: bytes) -> None:
        self._img_count += 1
+        unique_id = f"img_{self._img_count}.jpg"
        self.descriptions[path] = {
            "data": data,
+            "relative_path": unique_id,
            "table_classification": "NO_TABLE",
            "final_alt": ""
        }

+    async def post_process_async(self, key: str, md_content: str) -> str:
+        logger.info("Classifying images to detect tables (async).")
+        tasks = []
+        for p, info in self.descriptions.items():
+            tasks.append((p, classify_image_async(info["data"], self.gemini_api_key)))
+        for p, task in tasks:
+            try:
+                classification = await task
+                self.descriptions[p]['table_classification'] = classification
+            except Exception as e:
+                logger.error(f"Table classification error: {e}")
+                self.descriptions[p]['table_classification'] = "NO_TABLE"
        for p, info in self.descriptions.items():
            cls = info['table_classification']
            if cls == "TWO_COLUMN":

                info['final_alt'] = "HAS TO BE PROCESSED - three column table"
            else:
                info['final_alt'] = "NO_TABLE image"
        for p, info in self.descriptions.items():
            old_md = f""
            new_md = f"![{info['final_alt']}]({info['relative_path']})"
            md_content = md_content.replace(old_md, new_md)

        md_content = self._process_table_images_in_markdown(md_content)
        final_lines = []
        for line in md_content.split("\n"):
            if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
                final_lines.append(line.strip())
        return "\n".join(final_lines)

+    def post_process(self, key: str, md_content: str) -> str:
+        """
+        Synchronous wrapper around the asynchronous post_process_async.
+        """
+        return asyncio.run(self.post_process_async(key, md_content))
+
    def _process_table_images_in_markdown(self, md_content: str) -> str:
        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
        if not matches:
            return md_content
+        for (col_type, image_id) in matches:
+            logger.info(f"Processing table image => {image_id}, columns={col_type}")
+            temp_path = os.path.join(self.output_folder, image_id)
+            desc_item = None
+            for k, val in self.descriptions.items():
+                if val["relative_path"] == image_id:
+                    desc_item = val
+                    break
+            if not desc_item:
+                logger.warning(f"No matching image data for {image_id}, skipping extraction.")
+                continue
+            if not os.path.exists(temp_path):
+                with open(temp_path, "wb") as f:
+                    f.write(desc_item["data"])
            try:
                if col_type.lower() == 'two':
                    extractor = TableExtractor(

                        enable_subtopic_merge=False,
                        subtopic_threshold=0.2
                    )
+                row_boxes = extractor.process_image(temp_path)
+
+                out_folder = temp_path + "_rows"
                os.makedirs(out_folder, exist_ok=True)
+
+                extractor.save_extracted_cells(temp_path, row_boxes, out_folder)

                snippet = ["**Extracted table cells:**"]
                for i, row in enumerate(row_boxes):
                    row_dir = os.path.join(out_folder, f"row_{i}")
                    for j, _ in enumerate(row):
+                        cell_file = f"col_{j}.jpg"
                        cell_path = os.path.join(row_dir, cell_file)
                        relp = os.path.relpath(cell_path, self.output_folder)
                        snippet.append(f"")
                new_snip = "\n".join(snippet)
+
+                old_line = f""
+
                md_content = md_content.replace(old_line, new_snip)
            except Exception as e:
+                logger.error(f"Error processing table image {image_id}: {e}")
+            finally:
+                if os.path.exists(temp_path):
+                    os.remove(temp_path)
        return md_content

class MineruNoTextProcessor:

            logger.error(f"Error during GPU cleanup: {e}")

    def process(self, pdf_path: str) -> str:
+        logger.info(f"Processing PDF: {pdf_path}")
        try:
            subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)

+            logger.info(f"Gemini returned subtopics: {subtopics}")
            with open(pdf_path, "rb") as f:
                pdf_bytes = f.read()
+
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            total_pages = doc.page_count
            doc.close()

                logger.warning("No subtopics found. We'll process the entire PDF as fallback.")
                final_pages = set(range(total_pages))
            else:
                for subname, rng in subtopics.items():
                    if not (isinstance(rng, list) and len(rng) == 2):
                        logger.warning(f"Skipping subtopic '{subname}' => invalid range {rng}")
                        continue
                    start_p, end_p = rng
                    if start_p > end_p:
+                        logger.warning(f"Skipping subtopic '{subname}' => start > end {rng}")
                        continue
                    occs = find_all_occurrences(pdf_bytes, subname)
                    logger.info(f"Occurrences of subtopic '{subname}': {occs}")
                    doc_start_0 = start_p - 1

                            chosen_page = p
                            break
                    if chosen_page is None:
                        if occs:
                            chosen_page = occs[-1]
                            logger.warning(f"No occurrence >= {doc_start_0} for '{subname}'. Using last => {chosen_page}")

                    e0 = max(0, min(total_pages - 1, e0))
                    for pp in range(s0, e0 + 1):
                        final_pages.add(pp)
            if not final_pages:
                logger.warning("No valid pages after offset. We'll process entire PDF.")
                final_pages = set(range(total_pages))
            logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
+
            subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
+
            dataset = PymuDocDataset(subset_pdf_bytes)
            inference = doc_analyze(
                dataset,

                table_enable=self.table_enable
            )
            logger.info("doc_analyze complete. Extracting images.")
            writer = LocalImageWriter(self.output_folder, self.gemini_api_key)
            pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
            md_content = pipe_result.get_markdown("local-unique-prefix/")
+
            final_markdown = writer.post_process("local-unique-prefix/", md_content)
            out_path = os.path.join(self.output_folder, "final_output.md")
+
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(final_markdown)
+
            return final_markdown
+
        finally:
            self.cleanup_gpu()

    try:
        processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
        md_output = processor.process(input_pdf)
    except Exception as e:
        logger.error(f"Processing failed: {e}")
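One more note on `post_process_async`: `classify_image_async(...)` returns a coroutine that does not start running until it is awaited, and the method awaits the collected coroutines one at a time, so the classification requests still run one after another. If the intent is to overlap the long Gemini calls, the collection loop can be replaced with `asyncio.gather`, roughly like this (a sketch against the names used above, not the committed code):

import asyncio

async def classify_all(descriptions: dict, api_key: str) -> None:
    # Schedule every classification at once so the thread-pool calls overlap.
    # classify_image_async and logger come from this module.
    paths = list(descriptions)
    results = await asyncio.gather(
        *(classify_image_async(descriptions[p]["data"], api_key) for p in paths),
        return_exceptions=True,
    )
    for p, res in zip(paths, results):
        if isinstance(res, Exception):
            logger.error(f"Table classification error: {res}")
            descriptions[p]["table_classification"] = "NO_TABLE"
        else:
            descriptions[p]["table_classification"] = res

`return_exceptions=True` keeps a single failed call from cancelling the rest, matching the per-image try/except in the committed version.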