MinerU

Paused

App Files Files Community

SkyNait committed on Feb 26

Commit

1c9a4f5

1 Parent(s): 1805880

testing

Browse files

Files changed (2) hide show

input_output/output/images/img_28.png +0 -0
topic_extraction.py +175 -224

input_output/output/images/img_28.png CHANGED Viewed

topic_extraction.py CHANGED Viewed

@@ -5,27 +5,19 @@ import gc
 import json
 import logging
 import fitz  # PyMuPDF (pip install pymupdf)
 import base64
 import concurrent.futures
-from io import BytesIO
 from typing import List, Dict, Any
-# Attempt to import google.genai
-try:
-    from google import genai
-    from google.genai import types
-except ImportError:
-    genai = None
-    types = None
 import torch
 import cv2
-# Magic PDF pipeline
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-# Your TableExtractor from topic_extraction_upgrade (or similar)
 from table_row_extraction import TableExtractor
 logging.basicConfig(level=logging.INFO)
@@ -34,130 +26,113 @@ logger.setLevel(logging.INFO)
 # -------------------------------------------------------------------
-# Helper: create a subset PDF with only desired pages
 # -------------------------------------------------------------------
-def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
-    """
-    Using PyMuPDF, create a new PDF that contains only the pages in page_indices (0-based).
-    Return the resulting PDF as bytes.
-    """
-    if not page_indices:
-        return original_pdf_bytes  # If empty, just return original
-    doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
-    new_doc = fitz.open()  # empty PDF to insert pages into
-    sorted_pages = sorted(set(page_indices))
-    for p in sorted_pages:
-        if 0 <= p < doc.page_count:
-            new_doc.insert_pdf(doc, from_page=p, to_page=p)
-        else:
-            logger.warning(f"Page index {p} is out of range, skipping.")
-    subset_bytes = new_doc.tobytes()
-    new_doc.close()
-    doc.close()
-    return subset_bytes
-# -------------------------------------------------------------------
-# Gemini-based subtopic extraction
-# -------------------------------------------------------------------
-class GeminiTopicExtractor:
-    """
-    Uses Gemini to parse the PDF text, looking specifically for
-    "2 Subject content and assessment information" and subtopics with pages.
-    """
-    def __init__(self, api_key: str = None):
-        self.api_key = api_key or os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
-        if not self.api_key:
-            raise ValueError("Gemini API key not found in environment or constructor.")
         if genai is None or types is None:
-            logger.warning("google.genai is not installed. Subtopic extraction won't work.")
-    def extract_subtopics(self, pdf_path: str) -> Dict[str, Any]:
         """
-        1) Read entire PDF text
-        2) Ask Gemini for JSON structure like:
-           {
-             "2 Subject content and assessment information": {
-                "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
-                "Paper 3: Statistics and Mechanics": [30, 42]
-             }
-           }
-        3) Return parsed JSON
         """
-        text_content = self._read_entire_pdf(pdf_path)
-        if not text_content.strip():
-            logger.warning("No text extracted from PDF. Returning empty JSON.")
-            return {}
         prompt = f"""
-You are given the text of a specification PDF.
-Identify the '2 Subject content and assessment information' topic.
-Under that topic, identify subtopics (like 'Paper 1 and Paper 2: Pure Mathematics', etc.)
-and their page ranges (1-based) from the text.
-Return JSON only, with structure:
-{{
-  "2 Subject content and assessment information": {{
-      "Paper 1 and Paper 2: Pure Mathematics": [start_page, end_page],
-      "Paper 3: Statistics and Mechanics": [start_page, end_page]
-  }}
-}}
-No extra explanation, just JSON.
-TEXT:
-{text_content}
         """
         try:
-            client = genai.Client(api_key=self.api_key)
-            response = client.models.generate_content(
-                model="gemini-2.0-flash",
                 contents=[prompt],
                 config=types.GenerateContentConfig(temperature=0.0)
             )
-            raw_text = response.text.strip() if response and response.text else "{}"
-            # Clean up any triple backticks
-            cleaned = raw_text.replace("```json", "").replace("```", "")
-            data = json.loads(cleaned)
-            return data
         except Exception as e:
-            logger.error(f"Error from Gemini subtopic extraction: {e}")
-            return {}
-    def _read_entire_pdf(self, pdf_path: str) -> str:
-        """
-        Return the entire PDF text by concatenating all pages.
-        """
-        text_parts = []
-        try:
-            doc = fitz.open(pdf_path)
-            for p in range(doc.page_count):
-                page_text = doc.load_page(p).get_text()
-                text_parts.append(page_text)
-            doc.close()
-        except Exception as e:
-            logger.error(f"Could not open/read PDF: {e}")
-        return "\n".join(text_parts)
 # -------------------------------------------------------------------
-# Gemini-based table classification (Mineru style)
 # -------------------------------------------------------------------
 def call_gemini_for_table_classification(image_data: bytes) -> str:
     if genai is None or types is None:
-        logger.warning("Gemini not available. Returning NO_TABLE.")
         return "NO_TABLE"
-    prompt = """You are given an image from an exam specification. Determine if it shows:
-- 'TWO_COLUMN' (2 col table),
-- 'THREE_COLUMN' (3 col table),
-- 'NO_TABLE' otherwise.
-Return only that label as entire response."""
     try:
-        client = genai.Client(api_key=os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU"))
         response = client.models.generate_content(
             model="gemini-2.0-flash",
             contents=[
@@ -167,7 +142,7 @@ Return only that label as entire response."""
                         {
                             "inline_data": {
                                 "mime_type": "image/jpeg",
-                                "data": base64.b64encode(image_data).decode('utf-8')
                             }
                         }
                     ]
@@ -175,34 +150,25 @@ Return only that label as entire response."""
             ],
             config=types.GenerateContentConfig(temperature=0.0)
         )
-        classification = response.text.strip() if (response and response.text) else "NO_TABLE"
-        classification = classification.upper()
-        if "THREE" in classification:
             return "THREE_COLUMN"
-        elif "TWO" in classification:
             return "TWO_COLUMN"
         else:
             return "NO_TABLE"
     except Exception as e:
-        logger.error(f"Table classification error: {e}")
         return "NO_TABLE"
-# -------------------------------------------------------------------
-# Gemini-based image description (Mineru style)
-# -------------------------------------------------------------------
 def call_gemini_for_image_description(image_data: bytes) -> str:
     if genai is None or types is None:
-        logger.warning("Gemini not available. Returning fallback desc.")
         return "Image description unavailable"
-    prompt_text = """This image is from an exam specification.
-No text data needed, just a short 20-word max summary if no table is detected.
-If it’s an MCQ, mention 'MCQ: A [...], B [...], etc.'"""
     try:
-        client = genai.Client(api_key=os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU"))
         response = client.models.generate_content(
             model="gemini-2.0-flash",
             contents=[
@@ -212,7 +178,7 @@ If it’s an MCQ, mention 'MCQ: A [...], B [...], etc.'"""
                         {
                             "inline_data": {
                                 "mime_type": "image/jpeg",
-                                "data": base64.b64encode(image_data).decode('utf-8')
                             }
                         }
                     ]
@@ -220,20 +186,21 @@ If it’s an MCQ, mention 'MCQ: A [...], B [...], etc.'"""
             ],
             config=types.GenerateContentConfig(temperature=0.0)
         )
-        return response.text.strip() if response and response.text else "Image description unavailable"
     except Exception as e:
-        logger.error(f"Gemini image description error: {e}")
         return "Image description unavailable"
 # -------------------------------------------------------------------
-# The LocalImageWriter that does table extraction
 # -------------------------------------------------------------------
 class LocalImageWriter:
     """
-    Only writes images, does table classification, then modifies final MD
-    so that we keep only table references. We do not keep any text lines.
     """
     def __init__(self, output_folder: str):
         self.output_folder = output_folder
@@ -249,7 +216,6 @@ class LocalImageWriter:
         self._img_count += 1
         local_filename = f"img_{self._img_count}.png"
         local_path = os.path.join(self.images_dir, local_filename)
         with open(local_path, "wb") as f:
             f.write(data)
@@ -263,9 +229,9 @@ class LocalImageWriter:
     def post_process(self, key: str, md_content: str) -> str:
         # 1) Table classification
-        with concurrent.futures.ThreadPoolExecutor(max_workers=len(self.descriptions)) as executor:
             fut_map = {
-                executor.submit(call_gemini_for_table_classification, info["data"]): p
                 for p, info in self.descriptions.items()
             }
             for fut in concurrent.futures.as_completed(fut_map):
@@ -274,15 +240,15 @@ class LocalImageWriter:
                     classification = fut.result()
                     self.descriptions[path]['table_classification'] = classification
                 except Exception as e:
-                    logger.error(f"[Gemini Table Classification Error for {path}]: {e}")
                     self.descriptions[path]['table_classification'] = "NO_TABLE"
-        # 2) If NO_TABLE => normal gemini-based description
-        with concurrent.futures.ThreadPoolExecutor(max_workers=len(self.descriptions)) as executor:
             fut_map2 = {}
             for p, info in self.descriptions.items():
                 if info['table_classification'] == "NO_TABLE":
-                    fut = executor.submit(call_gemini_for_image_description, info['data'])
                     fut_map2[fut] = p
             for fut in concurrent.futures.as_completed(fut_map2):
@@ -291,10 +257,10 @@ class LocalImageWriter:
                     desc = fut.result()
                     self.descriptions[path]['final_alt'] = desc
                 except Exception as e:
-                    logger.error(f"[Gemini Desc Error for {path}]: {e}")
                     self.descriptions[path]['final_alt'] = "Image description unavailable"
-        # 3) If 2/3-col => "HAS TO BE PROCESSED"
         for p, info in self.descriptions.items():
             cls = info['table_classification']
             if cls == "TWO_COLUMN":
@@ -306,22 +272,20 @@ class LocalImageWriter:
         # 4) Replace placeholders
         for p, info in self.descriptions.items():
-            old_md = f"![]({key}{p})"
-            new_md = f"![{info['final_alt']}]({info['relative_path']})"
-            md_content = md_content.replace(old_md, new_md)
-        # 5) For "HAS TO BE PROCESSED" => run TableExtractor => replace single line with row/cell lines
         md_content = self._process_table_images_in_markdown(md_content)
-        # 6) **Remove all text** => keep only lines that are image references
         final_lines = []
         for line in md_content.split("\n"):
-            # We only keep lines that start with "!" or have "!["
-            # (i.e. lines referencing images)
-            if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
-                final_lines.append(line.strip())
-        new_md = "\n".join(final_lines)
-        return new_md
     def _process_table_images_in_markdown(self, md_content: str) -> str:
         pattern = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
@@ -350,7 +314,6 @@ class LocalImageWriter:
                 os.makedirs(out_folder, exist_ok=True)
                 extractor.save_extracted_cells(abs_image_path, row_boxes, out_folder)
-                # Build snippet
                 snippet_lines = ["**Extracted table cells:**"]
                 for i, row in enumerate(row_boxes):
                     row_dir = os.path.join(out_folder, f"row_{i}")
@@ -363,7 +326,6 @@ class LocalImageWriter:
                 new_snippet = "\n".join(snippet_lines)
                 old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_path})"
                 md_content = md_content.replace(old_line, new_snippet)
             except Exception as e:
                 logger.error(f"Error processing table image {image_path}: {e}")
@@ -371,14 +333,14 @@ class LocalImageWriter:
 # -------------------------------------------------------------------
-# Final Processor
 # -------------------------------------------------------------------
-class MineruNoTextProcessor:
     """
-    1) Use Gemini to find '2 Subject content...' subtopics + pages
-    2) Subset PDF to those pages
-    3) doc_analyze => only images => classify tables => produce markdown with table rows only
-    4) No textual data in final markdown
     """
     def __init__(self, output_folder: str):
         self.output_folder = output_folder
@@ -389,7 +351,7 @@ class MineruNoTextProcessor:
         self.table_enable = False
         self.language = "en"
-        self.subtopic_extractor = GeminiTopicExtractor()
     def cleanup_gpu(self):
         try:
@@ -397,54 +359,64 @@ class MineruNoTextProcessor:
             torch.cuda.empty_cache()
             logger.info("GPU memory cleaned up.")
         except Exception as e:
-            logger.error(f"Error during GPU cleanup: {e}")
     def process(self, pdf_path: str) -> str:
-        """
-        1) Extract subtopics JSON from the PDF
-        2) Flatten page ranges for subtopics
-        3) Subset PDF
-        4) doc_analyze => images => produce MD with only table lines
-        5) Return final MD
-        """
         logger.info(f"Processing PDF: {pdf_path}")
         try:
-            # 1) Extract subtopics
-            data = self.subtopic_extractor.extract_subtopics(pdf_path)
-            if not data or "2 Subject content and assessment information" not in data:
-                logger.warning("Gemini did not return '2 Subject content...' or data is empty.")
-                page_indices = None
             else:
-                # 2) Flatten pages
-                page_indices = self._collect_page_indices(data["2 Subject content and assessment information"])
-            with open(pdf_path, "rb") as f:
-                original_pdf_bytes = f.read()
-            # If no pages found => entire doc
-            if page_indices:
-                # Convert from 1-based => 0-based
-                doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
-                max_p = doc.page_count
                 doc.close()
                 zero_based = []
-                for p in page_indices:
                     z = p - 1
-                    if 0 <= z < max_p:
                         zero_based.append(z)
                 zero_based = sorted(set(zero_based))
-                if zero_based:
-                    logger.info(f"Subtopic pages (0-based): {zero_based}")
-                    subset_pdf_bytes = create_subset_pdf(original_pdf_bytes, zero_based)
                 else:
-                    logger.warning("No valid subtopic pages, using entire doc.")
-                    subset_pdf_bytes = original_pdf_bytes
-            else:
-                subset_pdf_bytes = original_pdf_bytes
-            # 3) doc_analyze with subset
             dataset = PymuDocDataset(subset_pdf_bytes)
             inference = doc_analyze(
                 dataset,
@@ -456,51 +428,30 @@ class MineruNoTextProcessor:
             )
             logger.info("doc_analyze complete. Extracting images...")
-            # 4) Only images => table classification => final MD
-            image_writer = LocalImageWriter(self.output_folder)
-            pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
             md_content = pipe_result.get_markdown("local-unique-prefix/")
-            final_markdown = image_writer.post_process("local-unique-prefix/", md_content)
-            # 5) Save final
             md_path = os.path.join(self.output_folder, "final_output.md")
             with open(md_path, "w", encoding="utf-8") as f:
                 f.write(final_markdown)
             logger.info(f"Markdown saved to: {md_path}")
             return final_markdown
         finally:
             self.cleanup_gpu()
-    def _collect_page_indices(self, subtopic_dict: Dict[str, List[int]]) -> List[int]:
-        """
-        Given something like:
-        {
-          "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
-          "Paper 3: Statistics and Mechanics": [30, 42]
-        }
-        Return [11..29, 30..42] => a flattened list of pages
-        """
-        pages = []
-        for _, rng in subtopic_dict.items():
-            if isinstance(rng, list) and len(rng) == 2:
-                start_p, end_p = rng
-                # add all pages from start to end (inclusive)
-                for p in range(start_p, end_p + 1):
-                    pages.append(p)
-        return pages
 # -------------------------------------------------------------------
 # Example usage
 # -------------------------------------------------------------------
 if __name__ == "__main__":
     input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
-    output_dir = "/home/user/app/input_output/output"
-    processor = MineruNoTextProcessor(output_folder=output_dir)
     final_md = processor.process(input_pdf)
-    # print("\n=== FINAL MARKDOWN (TABLE ROWS ONLY) ===\n")
-    # print(final_md)

 import json
 import logging
 import fitz  # PyMuPDF (pip install pymupdf)
+import requests
 import base64
 import concurrent.futures
 from typing import List, Dict, Any
 import torch
 import cv2
+# magic-pdf
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+# TableExtractor from table_row_extraction.py
 from table_row_extraction import TableExtractor
 logging.basicConfig(level=logging.INFO)
 # -------------------------------------------------------------------
+# 1) "ContentsExtractor" approach (similar to contents_extractor_v2)
 # -------------------------------------------------------------------
import os  # harmless if already imported above; keeps this block self-contained

# google-genai is an optional dependency: when it cannot be imported, both
# names are set to None so that callers can test for availability.
try:
    from google import genai
    from google.genai import types
except ImportError:
    genai = None
    types = None

# SECURITY: never commit the API key. It is read from the GEMINI_API_KEY
# environment variable; an empty string means "not configured".
# NOTE(review): the key that used to be hard-coded here has been published in
# the repository history and should be revoked.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
class ContentsExtractor:
    """
    Ask Gemini for the page ranges of the specification's subtopics.

    Typical use: read the text of the first pages with `extract_first_pages`
    and pass it to `extract_contents`, which returns a JSON string.
    """

    def __init__(self, api_key: str = None):
        """
        Create the Gemini client.

        api_key: key to use. When omitted, the module-level GEMINI_API_KEY is
        used; it is looked up when the instance is created rather than when
        the class is defined.

        Raises ImportError when google.genai could not be imported.
        """
        if genai is None or types is None:
            raise ImportError("google.genai is not installed or environment not set up.")
        self.client = genai.Client(api_key=api_key if api_key is not None else GEMINI_API_KEY)
        self.model = "gemini-2.0-flash"

    @staticmethod
    def extract_first_pages(pdf_path: str, num_pages: int = 10) -> str:
        """
        Reads up to `num_pages` from pdf_path, returns combined text.

        Page texts are joined with newlines. Returns "" (after logging the
        error) when the PDF cannot be opened or read.
        """
        try:
            doc = fitz.open(pdf_path)
            try:
                pages_to_read = min(doc.page_count, num_pages)
                return "\n".join(doc[i].get_text() for i in range(pages_to_read))
            finally:
                # Close the document even if text extraction fails part-way.
                doc.close()
        except Exception as e:
            logger.error(f"[ContentsExtractor] Could not open or read PDF: {e}")
            return ""

    @staticmethod
    def _strip_code_fences(raw: str) -> str:
        """Remove Markdown ``` / ```json fences the model may wrap around its answer."""
        return raw.replace("```json", "").replace("```", "").strip()

    def extract_contents(self, text: str) -> str:
        """
        Send the text to Gemini and return its answer as a JSON string.

        The answer is expected to map subtopic names to [start_page, end_page]
        (1-based). Markdown code fences around the answer are removed so the
        result can be passed straight to `json.loads`.

        Returns "{}" when `text` is blank, when the model returns nothing, or
        when the request fails (the error is logged).
        """
        if not text.strip():
            return "{}"
        prompt = f"""
        You have the first pages of an A-Level Mathematics specification.
        Identify the subtopics under '2 Subject content and assessment information', especially:
         - "Paper 1 and Paper 2: Pure Mathematics"
         - "Paper 3: Statistics and Mechanics"
        Return a JSON of the form:
        {{
          "Paper 1 and Paper 2: Pure Mathematics": [start_page, end_page],
          "Paper 3: Statistics and Mechanics": [start_page, end_page]
        }}
        Where pages are 1-based.
        No extra text. Only JSON.
        TEXT:
        {text}
        """
        try:
            response = self.client.models.generate_content(
                model=self.model,
                contents=[prompt],
                config=types.GenerateContentConfig(temperature=0.0)
            )
            raw = response.text if (response and response.text) else ""
            # Without this, a fenced answer makes the caller's json.loads fail
            # and the pipeline silently falls back to the whole document.
            return self._strip_code_fences(raw) or "{}"
        except Exception as e:
            logger.error(f"[ContentsExtractor] LLM error: {e}")
            return "{}"
+# -------------------------------------------------------------------
+# 2) Helper to create a PDF subset from specific pages
+# -------------------------------------------------------------------
def create_subset_pdf(pdf_bytes: bytes, page_indices: List[int]) -> bytes:
    """
    Return a new PDF containing only the pages in `page_indices` (0-based).

    Duplicate indices are ignored and pages are emitted in ascending order.
    Out-of-range indices are skipped with a warning. If `page_indices` is
    empty, or none of its indices is valid, the original bytes are returned
    unchanged.
    """
    if not page_indices:
        return pdf_bytes
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        new_doc = fitz.open()
        try:
            for p in sorted(set(page_indices)):
                if 0 <= p < doc.page_count:
                    new_doc.insert_pdf(doc, from_page=p, to_page=p)
                else:
                    logger.warning(f"Page index {p} out of range.")
            if new_doc.page_count == 0:
                # Nothing usable was selected. A zero-page document cannot be
                # serialised by PyMuPDF, so fall back to the original, as the
                # contract for "no pages" promises.
                return pdf_bytes
            return new_doc.tobytes()
        finally:
            new_doc.close()
    finally:
        # Release both documents even if insertion or serialisation fails.
        doc.close()
 # -------------------------------------------------------------------
+# 3) Gemini-based table classification and description
 # -------------------------------------------------------------------
 def call_gemini_for_table_classification(image_data: bytes) -> str:
     if genai is None or types is None:
+        logger.warning("Gemini not available. Return NO_TABLE.")
         return "NO_TABLE"
+    prompt = """Is this image a 2-col table, 3-col table, or not a table? Return 'TWO_COLUMN','THREE_COLUMN','NO_TABLE'."""
     try:
+        client = genai.Client(api_key=GEMINI_API_KEY)
         response = client.models.generate_content(
             model="gemini-2.0-flash",
             contents=[
                         {
                             "inline_data": {
                                 "mime_type": "image/jpeg",
+                                "data": base64.b64encode(image_data).decode("utf-8")
                             }
                         }
                     ]
             ],
             config=types.GenerateContentConfig(temperature=0.0)
         )
+        out = response.text.strip().upper() if (response and response.text) else "NO_TABLE"
+        if "THREE" in out:
             return "THREE_COLUMN"
+        elif "TWO" in out:
             return "TWO_COLUMN"
         else:
             return "NO_TABLE"
     except Exception as e:
+        logger.error(f"[call_gemini_for_table_classification] error: {e}")
         return "NO_TABLE"
 def call_gemini_for_image_description(image_data: bytes) -> str:
     if genai is None or types is None:
+        logger.warning("Gemini not available. Return fallback desc.")
         return "Image description unavailable"
+    prompt_text = """Short 20-word max summary if not a table. If it's an MCQ, mention 'MCQ: ...'."""
     try:
+        client = genai.Client(api_key=GEMINI_API_KEY)
         response = client.models.generate_content(
             model="gemini-2.0-flash",
             contents=[
                         {
                             "inline_data": {
                                 "mime_type": "image/jpeg",
+                                "data": base64.b64encode(image_data).decode("utf-8")
                             }
                         }
                     ]
             ],
             config=types.GenerateContentConfig(temperature=0.0)
         )
+        return response.text.strip() if (response and response.text) else "Image description unavailable"
     except Exception as e:
+        logger.error(f"[call_gemini_for_image_description] error: {e}")
         return "Image description unavailable"
 # -------------------------------------------------------------------
+# 4) LocalImageWriter that removes all text from final .md
 # -------------------------------------------------------------------
 class LocalImageWriter:
     """
+    - Receives images from doc_analyze
+    - Classifies them as table or no_table
+    - Replaces single table lines with row/cell references
+    - Output MD has only lines referencing images
     """
     def __init__(self, output_folder: str):
         self.output_folder = output_folder
         self._img_count += 1
         local_filename = f"img_{self._img_count}.png"
         local_path = os.path.join(self.images_dir, local_filename)
         with open(local_path, "wb") as f:
             f.write(data)
     def post_process(self, key: str, md_content: str) -> str:
         # 1) Table classification
+        with concurrent.futures.ThreadPoolExecutor(max_workers=len(self.descriptions)) as exe:
             fut_map = {
+                exe.submit(call_gemini_for_table_classification, info["data"]): p
                 for p, info in self.descriptions.items()
             }
             for fut in concurrent.futures.as_completed(fut_map):
                     classification = fut.result()
                     self.descriptions[path]['table_classification'] = classification
                 except Exception as e:
+                    logger.error(f"Classification error for {path}: {e}")
                     self.descriptions[path]['table_classification'] = "NO_TABLE"
+        # 2) If NO_TABLE => short description
+        with concurrent.futures.ThreadPoolExecutor(max_workers=len(self.descriptions)) as exe:
             fut_map2 = {}
             for p, info in self.descriptions.items():
                 if info['table_classification'] == "NO_TABLE":
+                    fut = exe.submit(call_gemini_for_image_description, info["data"])
                     fut_map2[fut] = p
             for fut in concurrent.futures.as_completed(fut_map2):
                     desc = fut.result()
                     self.descriptions[path]['final_alt'] = desc
                 except Exception as e:
+                    logger.error(f"Desc error for {path}: {e}")
                     self.descriptions[path]['final_alt'] = "Image description unavailable"
+        # 3) If 2-col or 3-col => "HAS TO BE PROCESSED"
         for p, info in self.descriptions.items():
             cls = info['table_classification']
             if cls == "TWO_COLUMN":
         # 4) Replace placeholders
         for p, info in self.descriptions.items():
+            old_tag = f"![]({key}{p})"
+            new_tag = f"![{info['final_alt']}]({info['relative_path']})"
+            md_content = md_content.replace(old_tag, new_tag)
+        # 5) For "HAS TO BE PROCESSED" => run TableExtractor => row/cell references
         md_content = self._process_table_images_in_markdown(md_content)
+        # 6) Keep only lines referencing images
         final_lines = []
         for line in md_content.split("\n"):
+            line = line.strip()
+            if re.match(r"^!\[.*\]\(.*\)$", line):
+                final_lines.append(line)
+        return "\n".join(final_lines)
     def _process_table_images_in_markdown(self, md_content: str) -> str:
         pattern = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
                 os.makedirs(out_folder, exist_ok=True)
                 extractor.save_extracted_cells(abs_image_path, row_boxes, out_folder)
                 snippet_lines = ["**Extracted table cells:**"]
                 for i, row in enumerate(row_boxes):
                     row_dir = os.path.join(out_folder, f"row_{i}")
                 new_snippet = "\n".join(snippet_lines)
                 old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_path})"
                 md_content = md_content.replace(old_line, new_snippet)
             except Exception as e:
                 logger.error(f"Error processing table image {image_path}: {e}")
 # -------------------------------------------------------------------
+# 5) Final Pipeline
 # -------------------------------------------------------------------
+class MineruPipelineForSubtopics:
     """
+    1) Extract ~10 pages to parse contents with Gemini
+    2) Identify subtopic pages for 'Paper 1 and Paper 2: Pure Mathematics' and 'Paper 3: Statistics and Mechanics'
+    3) Create subset PDF with those pages
+    4) doc_analyze => only images => final MD with table references
     """
     def __init__(self, output_folder: str):
         self.output_folder = output_folder
         self.table_enable = False
         self.language = "en"
+        self.contents_extractor = ContentsExtractor(api_key=GEMINI_API_KEY)
     def cleanup_gpu(self):
         try:
             torch.cuda.empty_cache()
             logger.info("GPU memory cleaned up.")
         except Exception as e:
+            logger.error(f"Cleanup GPU error: {e}")
     def process(self, pdf_path: str) -> str:
         logger.info(f"Processing PDF: {pdf_path}")
         try:
+            # Step 1) parse first pages => subtopics
+            first_text = self.contents_extractor.extract_first_pages(pdf_path, num_pages=10)
+            raw_json = self.contents_extractor.extract_contents(first_text)
+            logger.info(f"[ContentsExtraction] raw LLM output: {raw_json}")
+            try:
+                subtopics_dict = json.loads(raw_json)
+            except json.JSONDecodeError:
+                logger.warning("Gemini did not return valid JSON. We'll parse entire doc.")
+                subtopics_dict = {}
+            # Step 2) gather pages from subtopics
+            # We expect keys like "Paper 1 and Paper 2: Pure Mathematics", "Paper 3: Statistics and Mechanics"
+            # If the LLM is correct, we'll get e.g. { "Paper 1 and Paper 2: Pure Mathematics": [11, 29], "Paper 3: Statistics and Mechanics": [30, 38] }
+            pages_1_2 = []
+            pages_3 = []
+            if "Paper 1 and Paper 2: Pure Mathematics" in subtopics_dict:
+                rng = subtopics_dict["Paper 1 and Paper 2: Pure Mathematics"]
+                if len(rng) == 2:
+                    for p in range(rng[0], rng[1] + 1):
+                        pages_1_2.append(p)
+            if "Paper 3: Statistics and Mechanics" in subtopics_dict:
+                rng = subtopics_dict["Paper 3: Statistics and Mechanics"]
+                if len(rng) == 2:
+                    for p in range(rng[0], rng[1] + 1):
+                        pages_3.append(p)
+            all_subtopic_pages = pages_1_2 + pages_3
+            if not all_subtopic_pages:
+                logger.warning("No subtopic pages found. We'll do entire doc.")
+                subset_pdf_bytes = open(pdf_path, "rb").read()
             else:
+                # Convert to 0-based
+                doc = fitz.open(pdf_path)
+                max_page = doc.page_count
                 doc.close()
                 zero_based = []
+                for p in all_subtopic_pages:
                     z = p - 1
+                    if 0 <= z < max_page:
                         zero_based.append(z)
                 zero_based = sorted(set(zero_based))
+                logger.info(f"Final subtopic pages (0-based): {zero_based}")
+                # If empty => entire doc
+                if not zero_based:
+                    subset_pdf_bytes = open(pdf_path, "rb").read()
                 else:
+                    original_bytes = open(pdf_path, "rb").read()
+                    subset_pdf_bytes = create_subset_pdf(original_bytes, zero_based)
+            # Step 3) doc_analyze => images => final MD
             dataset = PymuDocDataset(subset_pdf_bytes)
             inference = doc_analyze(
                 dataset,
             )
             logger.info("doc_analyze complete. Extracting images...")
+            writer = LocalImageWriter(self.output_folder)
+            pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
             md_content = pipe_result.get_markdown("local-unique-prefix/")
+            final_markdown = writer.post_process("local-unique-prefix/", md_content)
             md_path = os.path.join(self.output_folder, "final_output.md")
             with open(md_path, "w", encoding="utf-8") as f:
                 f.write(final_markdown)
             logger.info(f"Markdown saved to: {md_path}")
             return final_markdown
         finally:
             self.cleanup_gpu()
 # -------------------------------------------------------------------
 # Example usage
 # -------------------------------------------------------------------
 if __name__ == "__main__":
     input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
+    output_dir = "/home/user/app/input_output/outputed"
+    processor = MineruPipelineForSubtopics(output_folder=output_dir)
     final_md = processor.process(input_pdf)
+    print("\n===== FINAL .MD =====\n")
+    # print(final_md)