Spaces:

Duplicated from opendatalab/MinerU

Quextro
/

MinerU

Paused

App Files Community

SkyNait committed on Feb 26

Commit

b7ce5d6

·

1 Parent(s): 2ee4ef1

test_pdf

Files changed (16) hide show

input_output/outputed/final_output.md +12 -2
input_output/outputed/images/img_34.png_rows/row_0/col_0.png +0 -0
input_output/outputed/images/img_34.png_rows/row_0/col_1.png +0 -0
input_output/outputed/images/img_34.png_rows/row_1/col_0.png +0 -0
input_output/outputed/images/img_34.png_rows/row_1/col_1.png +0 -0
input_output/outputed/images/img_34.png_rows/row_2/col_0.png +0 -0
input_output/outputed/images/img_34.png_rows/row_2/col_1.png +0 -0
input_output/outputed/images/img_35.png_rows/row_0/col_0.png +0 -0
input_output/outputed/images/img_35.png_rows/row_0/col_1.png +0 -0
input_output/outputed/images/img_35.png_rows/row_1/col_0.png +0 -0
input_output/outputed/images/img_35.png_rows/row_1/col_1.png +0 -0
input_output/outputed/images/img_35.png_rows/row_2/col_0.png +0 -0
input_output/outputed/images/img_35.png_rows/row_3/col_0.png +0 -0
input_output/outputed/images/img_35.png_rows/row_4/col_0.png +0 -0
input_output/outputed/images/img_35.png_rows/row_4/col_1.png +0 -0
topic_extraction.py +302 -282

input_output/outputed/final_output.md CHANGED Viewed

@@ -1,4 +1,4 @@
-![Two origami pinecones, one brown and one tan, are displayed against a light purple background.](images/img_1.png)
 ![Row 0 Col 0](images/img_2.png_rows/row_0/col_0.png)
 ![Row 1 Col 0](images/img_2.png_rows/row_1/col_0.png)
 ![Row 2 Col 0](images/img_2.png_rows/row_2/col_0.png)
@@ -234,10 +234,20 @@
 ![Row 4 Col 0](images/img_33.png_rows/row_4/col_0.png)
 ![Row 4 Col 1](images/img_33.png_rows/row_4/col_1.png)
 ![Row 0 Col 0](images/img_34.png_rows/row_0/col_0.png)
 ![Row 1 Col 0](images/img_34.png_rows/row_1/col_0.png)
 ![Row 2 Col 0](images/img_34.png_rows/row_2/col_0.png)
 ![Row 3 Col 0](images/img_34.png_rows/row_3/col_0.png)
-![Table showing the percentage breakdown of assessment objectives (AO1, AO2, AO3) for GCE A Level Maths papers.](images/img_35.png)
 ![Row 0 Col 0](images/img_36.png_rows/row_0/col_0.png)
 ![Row 1 Col 0](images/img_36.png_rows/row_1/col_0.png)
 ![Row 2 Col 0](images/img_36.png_rows/row_2/col_0.png)

+![Two origami pinecones, one brown and one tan, on a purple background.](images/img_1.png)
 ![Row 0 Col 0](images/img_2.png_rows/row_0/col_0.png)
 ![Row 1 Col 0](images/img_2.png_rows/row_1/col_0.png)
 ![Row 2 Col 0](images/img_2.png_rows/row_2/col_0.png)
 ![Row 4 Col 0](images/img_33.png_rows/row_4/col_0.png)
 ![Row 4 Col 1](images/img_33.png_rows/row_4/col_1.png)
 ![Row 0 Col 0](images/img_34.png_rows/row_0/col_0.png)
+![Row 0 Col 1](images/img_34.png_rows/row_0/col_1.png)
 ![Row 1 Col 0](images/img_34.png_rows/row_1/col_0.png)
+![Row 1 Col 1](images/img_34.png_rows/row_1/col_1.png)
 ![Row 2 Col 0](images/img_34.png_rows/row_2/col_0.png)
+![Row 2 Col 1](images/img_34.png_rows/row_2/col_1.png)
 ![Row 3 Col 0](images/img_34.png_rows/row_3/col_0.png)
+![Row 0 Col 0](images/img_35.png_rows/row_0/col_0.png)
+![Row 0 Col 1](images/img_35.png_rows/row_0/col_1.png)
+![Row 1 Col 0](images/img_35.png_rows/row_1/col_0.png)
+![Row 1 Col 1](images/img_35.png_rows/row_1/col_1.png)
+![Row 2 Col 0](images/img_35.png_rows/row_2/col_0.png)
+![Row 3 Col 0](images/img_35.png_rows/row_3/col_0.png)
+![Row 4 Col 0](images/img_35.png_rows/row_4/col_0.png)
+![Row 4 Col 1](images/img_35.png_rows/row_4/col_1.png)
 ![Row 0 Col 0](images/img_36.png_rows/row_0/col_0.png)
 ![Row 1 Col 0](images/img_36.png_rows/row_1/col_0.png)
 ![Row 2 Col 0](images/img_36.png_rows/row_2/col_0.png)

input_output/outputed/images/img_34.png_rows/row_0/col_0.png CHANGED Viewed

input_output/outputed/images/img_34.png_rows/row_0/col_1.png ADDED Viewed

input_output/outputed/images/img_34.png_rows/row_1/col_0.png CHANGED Viewed

input_output/outputed/images/img_34.png_rows/row_1/col_1.png ADDED Viewed

input_output/outputed/images/img_34.png_rows/row_2/col_0.png CHANGED Viewed

input_output/outputed/images/img_34.png_rows/row_2/col_1.png ADDED Viewed

input_output/outputed/images/img_35.png_rows/row_0/col_0.png ADDED Viewed

input_output/outputed/images/img_35.png_rows/row_0/col_1.png ADDED Viewed

input_output/outputed/images/img_35.png_rows/row_1/col_0.png ADDED Viewed

input_output/outputed/images/img_35.png_rows/row_1/col_1.png ADDED Viewed

input_output/outputed/images/img_35.png_rows/row_2/col_0.png ADDED Viewed

input_output/outputed/images/img_35.png_rows/row_3/col_0.png ADDED Viewed

input_output/outputed/images/img_35.png_rows/row_4/col_0.png ADDED Viewed

input_output/outputed/images/img_35.png_rows/row_4/col_1.png ADDED Viewed

topic_extraction.py CHANGED Viewed

@@ -5,215 +5,221 @@ import gc
 import json
 import logging
 import fitz  # PyMuPDF (pip install pymupdf)
-import requests
 import base64
 import concurrent.futures
 from typing import List, Dict, Any
 import torch
 import cv2
-# magic-pdf
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-# TableExtractor from your "topic_extraction_upgrade.py"
 from table_row_extraction import TableExtractor
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
-# -------------------------------------------------------------------
-# 1) "ContentsExtractor" approach (similar to contents_extractor_v2)
-# -------------------------------------------------------------------
-try:
-    from google import genai
-    from google.genai import types
-except ImportError:
-    genai = None
-    types = None
-GEMINI_API_KEY = "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU"
-class ContentsExtractor:
-    def __init__(self, api_key: str = GEMINI_API_KEY):
-        if genai is None or types is None:
-            raise ImportError("google.genai is not installed or environment not set up.")
-        self.client = genai.Client(api_key=api_key)
-        self.model = "gemini-2.0-flash"
-    @staticmethod
-    def extract_first_pages(pdf_path: str, num_pages: int = 10) -> str:
-        """
-        Reads up to `num_pages` from pdf_path, returns combined text.
-        """
-        try:
-            doc = fitz.open(pdf_path)
-            total_pages = doc.page_count
-            pages_to_read = min(total_pages, num_pages)
-            text_list = []
-            for i in range(pages_to_read):
-                page_text = doc[i].get_text()
-                text_list.append(page_text)
-            doc.close()
-            return "\n".join(text_list)
-        except Exception as e:
-            logger.error(f"[ContentsExtractor] Could not open or read PDF: {e}")
-            return ""
-    def extract_contents(self, text: str) -> str:
         """
-        Send the text to Gemini. Return raw LLM output, presumably JSON with subtopic pages.
         """
-        if not text.strip():
-            return "{}"
         prompt = f"""
-        You have the first pages of an A-Level Mathematics specification.
-        You will be provided with the first pages of an exam board document. Your goal is to extract
-        the main subject-related topics from the "Contents" section and structure them in a valid JSON format.
-        Instructions:
-        1. Identify the "Contents" section, which lists all topics, subtopics, and their corresponding pages.
-        2. Extract only the **highest-level, subject-related subtopics** (ignore organizational or administrative sections).
-        3. For subtopics, include the full range of pages from the first to the last subtopic.
-        4. Return the output in the following JSON format:
-            {{
-                "topic_name": [start_page, end_page]
-            }}
-        Important Notes:
-        - Ignore non-subject-related sections (e.g., "Introduction", "Exam Guidelines", "Appendices", "Assessment, Qualification at a glance").
-        - The extracted subtopics should represent major academic areas, not organizational or structural elements.
-        - Make sure that all of the pages for a subtopic are included, end page should be the start page of the topic
-            that comes next after the extracted one in contents section.
-        Examples:
-        1. Given this table of contents:
-            1 Introduction – 2
-                Why choose Edexcel A Level Mathematics? - 2
-                Supporting you in planning and implementing this qualification - 3
-                Qualification at a glance - 5
-            2 Subject content and assessment information – 7
-                Paper 1 and Paper 2: Pure Mathematics - 11
-                Paper 3: Statistics and Mechanics - 30
-                Assessment Objectives - 40
-            3 Administration and general information – 42
-                Entries - 42
-                Access arrangements, reasonable adjustments, special consideration and malpractice - 42
-                Student recruitment and progression - 45
-            Appendix 1: Formulae – 49
-            Appendix 2: Notation – 53
-            Appendix 3: Use of calculators – 59
-            Appendix 4: Assessment Objectives – 60
-            Appendix 5: The context for the development of this qualification – 62
-            Appendix 6: Transferable skills – 64
-            Appendix 7: Level 3 Extended Project qualification – 65
-            Appendix 8: Codes – 67
-            The correct output should be:
-            {{
-                "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
-                "Paper 3: Statistics and Mechanics": [30, 42]
-            }}
-        2. Given this table of contents:
-            Qualification at a glance – 1
-                Assessment Objectives and weightings - 4
-            Knowledge, skills and understanding – 5
-                Theme 1: Introduction to markets and market failure - 5
-                Theme 2: The UK economy – performance and policies - 11
-                Theme 3: Business behaviour and the labour market - 21
-                Theme 4: A global perspective - 29
-            Assessment – 39
-                Assessment summary - 39
-                Assessment objectives - 41
-                Assessment overview - 42
-                Breakdown of assessment objectives - 42
-                    Synoptic assessment - 43
-                    Discount code and performance tables - 43
-                    Access arrangements, reasonable adjustments and special consideration - 44
-                    Malpractice - 45
-                    Equality Act 2010 and Pearson equality policy - 45
-                    Synoptic assessment - 46
-                    Awarding and reporting - 47
-            Other information – 49
-                Student recruitment -49
-                Prior learning and other requirements -49
-                Progression - 49
-            Appendix 1: Transferable skills – 53
-            Appendix 2: Level 3 Extended Project qualification – 55
-            Appendix 3: Quantitative skills – 59
-            Appendix 4: Codes – 61
-            Appendix 5: Index – 63
-            The correct output should be:
-            {{
-                "Theme 1: Introduction to markets and market failure": [5, 10]
-                "Theme 2: The UK economy – performance and policies": - [11, 20]
-                "Theme 3: Business behaviour and the labour market": [21, 28]
-                "Theme 4: A global perspective": [29, 38]
-            }}
-        Where pages are 1-based.
-        No extra text. Only JSON.
-        TEXT:
-        {text}
         """
         try:
-            response = self.client.models.generate_content(
-                model=self.model,
                 contents=[prompt],
-                config=types.GenerateContentConfig(temperature=0.)
             )
-            return response.text.strip() if (response and response.text) else "{}"
         except Exception as e:
-            logger.error(f"[ContentsExtractor] LLM error: {e}")
-            return "{}"
-# -------------------------------------------------------------------
-# 2) Helper to create a PDF subset from specific pages
-# -------------------------------------------------------------------
-def create_subset_pdf(pdf_bytes: bytes, page_indices: List[int]) -> bytes:
-    """
-    Return a new PDF containing only the pages in `page_indices` (0-based).
-    If empty, returns original.
-    """
-    if not page_indices:
-        return pdf_bytes
-    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-    new_doc = fitz.open()
-    for p in sorted(set(page_indices)):
-        if 0 <= p < doc.page_count:
-            new_doc.insert_pdf(doc, from_page=p, to_page=p)
-        else:
-            logger.warning(f"Page index {p} out of range.")
-    out_bytes = new_doc.tobytes()
-    new_doc.close()
-    doc.close()
-    return out_bytes
-# -------------------------------------------------------------------
-# 3) Gemini-based table classification and description
-# -------------------------------------------------------------------
 def call_gemini_for_table_classification(image_data: bytes) -> str:
-    if genai is None or types is None:
-        logger.warning("Gemini not available. Return NO_TABLE.")
-        return "NO_TABLE"
-    prompt = """Is this image a 2-col table, 3-col table, or not a table? Return 'TWO_COLUMN','THREE_COLUMN','NO_TABLE'."""
     try:
-        client = genai.Client(api_key=GEMINI_API_KEY)
         response = client.models.generate_content(
             model="gemini-2.0-flash",
             contents=[
@@ -223,7 +229,7 @@ def call_gemini_for_table_classification(image_data: bytes) -> str:
                         {
                             "inline_data": {
                                 "mime_type": "image/jpeg",
-                                "data": base64.b64encode(image_data).decode("utf-8")
                             }
                         }
                     ]
@@ -231,25 +237,41 @@ def call_gemini_for_table_classification(image_data: bytes) -> str:
             ],
             config=types.GenerateContentConfig(temperature=0.0)
         )
-        out = response.text.strip().upper() if (response and response.text) else "NO_TABLE"
-        if "THREE" in out:
             return "THREE_COLUMN"
-        elif "TWO" in out:
             return "TWO_COLUMN"
         else:
             return "NO_TABLE"
     except Exception as e:
-        logger.error(f"[call_gemini_for_table_classification] error: {e}")
         return "NO_TABLE"
 def call_gemini_for_image_description(image_data: bytes) -> str:
-    if genai is None or types is None:
-        logger.warning("Gemini not available. Return fallback desc.")
-        return "Image description unavailable"
-    prompt_text = """Short 20-word max summary if not a table. If it's an MCQ, mention 'MCQ: ...'."""
     try:
-        client = genai.Client(api_key=GEMINI_API_KEY)
         response = client.models.generate_content(
             model="gemini-2.0-flash",
             contents=[
@@ -259,7 +281,7 @@ def call_gemini_for_image_description(image_data: bytes) -> str:
                         {
                             "inline_data": {
                                 "mime_type": "image/jpeg",
-                                "data": base64.b64encode(image_data).decode("utf-8")
                             }
                         }
                     ]
@@ -267,21 +289,16 @@ def call_gemini_for_image_description(image_data: bytes) -> str:
             ],
             config=types.GenerateContentConfig(temperature=0.0)
         )
-        return response.text.strip() if (response and response.text) else "Image description unavailable"
     except Exception as e:
-        logger.error(f"[call_gemini_for_image_description] error: {e}")
         return "Image description unavailable"
-# -------------------------------------------------------------------
-# 4) LocalImageWriter that removes all text from final .md
-# -------------------------------------------------------------------
 class LocalImageWriter:
     """
-    - Receives images from doc_analyze
-    - Classifies them as table or no_table
-    - Replaces single table lines with row/cell references
-    - Output MD has only lines referencing images
     """
     def __init__(self, output_folder: str):
         self.output_folder = output_folder
@@ -297,6 +314,7 @@ class LocalImageWriter:
         self._img_count += 1
         local_filename = f"img_{self._img_count}.png"
         local_path = os.path.join(self.images_dir, local_filename)
         with open(local_path, "wb") as f:
             f.write(data)
@@ -310,9 +328,9 @@ class LocalImageWriter:
     def post_process(self, key: str, md_content: str) -> str:
         # 1) Table classification
-        with concurrent.futures.ThreadPoolExecutor(max_workers=len(self.descriptions)) as exe:
             fut_map = {
-                exe.submit(call_gemini_for_table_classification, info["data"]): p
                 for p, info in self.descriptions.items()
             }
             for fut in concurrent.futures.as_completed(fut_map):
@@ -321,15 +339,15 @@ class LocalImageWriter:
                     classification = fut.result()
                     self.descriptions[path]['table_classification'] = classification
                 except Exception as e:
-                    logger.error(f"Classification error for {path}: {e}")
                     self.descriptions[path]['table_classification'] = "NO_TABLE"
-        # 2) If NO_TABLE => short description
-        with concurrent.futures.ThreadPoolExecutor(max_workers=len(self.descriptions)) as exe:
             fut_map2 = {}
             for p, info in self.descriptions.items():
                 if info['table_classification'] == "NO_TABLE":
-                    fut = exe.submit(call_gemini_for_image_description, info["data"])
                     fut_map2[fut] = p
             for fut in concurrent.futures.as_completed(fut_map2):
@@ -338,10 +356,10 @@ class LocalImageWriter:
                     desc = fut.result()
                     self.descriptions[path]['final_alt'] = desc
                 except Exception as e:
-                    logger.error(f"Desc error for {path}: {e}")
                     self.descriptions[path]['final_alt'] = "Image description unavailable"
-        # 3) If 2-col or 3-col => "HAS TO BE PROCESSED"
         for p, info in self.descriptions.items():
             cls = info['table_classification']
             if cls == "TWO_COLUMN":
@@ -353,20 +371,22 @@ class LocalImageWriter:
         # 4) Replace placeholders
         for p, info in self.descriptions.items():
-            old_tag = f"![]({key}{p})"
-            new_tag = f"![{info['final_alt']}]({info['relative_path']})"
-            md_content = md_content.replace(old_tag, new_tag)
-        # 5) For "HAS TO BE PROCESSED" => run TableExtractor => row/cell references
         md_content = self._process_table_images_in_markdown(md_content)
-        # 6) Keep only lines referencing images
         final_lines = []
         for line in md_content.split("\n"):
-            line = line.strip()
-            if re.match(r"^!\[.*\]\(.*\)$", line):
-                final_lines.append(line)
-        return "\n".join(final_lines)
     def _process_table_images_in_markdown(self, md_content: str) -> str:
         pattern = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
@@ -380,14 +400,17 @@ class LocalImageWriter:
             try:
                 if col_type.lower() == 'two':
                     extractor = TableExtractor(
                         merge_two_col_rows=True,
                         enable_subtopic_merge=True,
                         subtopic_threshold=0.2
                     )
                 else:
                     extractor = TableExtractor(
                         merge_two_col_rows=False,
-                        enable_subtopic_merge=False
                     )
                 row_boxes = extractor.process_image(abs_image_path)
@@ -395,6 +418,7 @@ class LocalImageWriter:
                 os.makedirs(out_folder, exist_ok=True)
                 extractor.save_extracted_cells(abs_image_path, row_boxes, out_folder)
                 snippet_lines = ["**Extracted table cells:**"]
                 for i, row in enumerate(row_boxes):
                     row_dir = os.path.join(out_folder, f"row_{i}")
@@ -407,22 +431,13 @@ class LocalImageWriter:
                 new_snippet = "\n".join(snippet_lines)
                 old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_path})"
                 md_content = md_content.replace(old_line, new_snippet)
             except Exception as e:
                 logger.error(f"Error processing table image {image_path}: {e}")
         return md_content
-# -------------------------------------------------------------------
-# 5) Final Pipeline
-# -------------------------------------------------------------------
-class MineruPipelineForSubtopics:
-    """
-    1) Extract ~10 pages to parse contents with Gemini
-    2) Identify subtopic pages for 'Paper 1 and Paper 2: Pure Mathematics' and 'Paper 3: Statistics and Mechanics'
-    3) Create subset PDF with those pages
-    4) doc_analyze => only images => final MD with table references
-    """
     def __init__(self, output_folder: str):
         self.output_folder = output_folder
         os.makedirs(self.output_folder, exist_ok=True)
@@ -432,7 +447,7 @@ class MineruPipelineForSubtopics:
         self.table_enable = False
         self.language = "en"
-        self.contents_extractor = ContentsExtractor(api_key=GEMINI_API_KEY)
     def cleanup_gpu(self):
         try:
@@ -440,64 +455,54 @@ class MineruPipelineForSubtopics:
             torch.cuda.empty_cache()
             logger.info("GPU memory cleaned up.")
         except Exception as e:
-            logger.error(f"Cleanup GPU error: {e}")
     def process(self, pdf_path: str) -> str:
         logger.info(f"Processing PDF: {pdf_path}")
         try:
-            # Step 1) parse first pages => subtopics
-            first_text = self.contents_extractor.extract_first_pages(pdf_path, num_pages=10)
-            raw_json = self.contents_extractor.extract_contents(first_text)
-            logger.info(f"[ContentsExtraction] raw LLM output: {raw_json}")
-            try:
-                subtopics_dict = json.loads(raw_json)
-            except json.JSONDecodeError:
-                logger.warning("Gemini did not return valid JSON. We'll parse entire doc.")
-                subtopics_dict = {}
-            # Step 2) gather pages from subtopics
-            # We expect keys like "Paper 1 and Paper 2: Pure Mathematics", "Paper 3: Statistics and Mechanics"
-            # If the LLM is correct, we'll get e.g. { "Paper 1 and Paper 2: Pure Mathematics": [11, 29], "Paper 3: Statistics and Mechanics": [30, 38] }
-            pages_1_2 = []
-            pages_3 = []
-            if "Paper 1 and Paper 2: Pure Mathematics" in subtopics_dict:
-                rng = subtopics_dict["Paper 1 and Paper 2: Pure Mathematics"]
-                if len(rng) == 2:
-                    for p in range(rng[0], rng[1] + 1):
-                        pages_1_2.append(p)
-            if "Paper 3: Statistics and Mechanics" in subtopics_dict:
-                rng = subtopics_dict["Paper 3: Statistics and Mechanics"]
-                if len(rng) == 2:
-                    for p in range(rng[0], rng[1] + 1):
-                        pages_3.append(p)
-            all_subtopic_pages = pages_1_2 + pages_3
-            if not all_subtopic_pages:
-                logger.warning("No subtopic pages found. We'll do entire doc.")
-                subset_pdf_bytes = open(pdf_path, "rb").read()
             else:
-                # Convert to 0-based
-                doc = fitz.open(pdf_path)
-                max_page = doc.page_count
                 doc.close()
                 zero_based = []
-                for p in all_subtopic_pages:
                     z = p - 1
-                    if 0 <= z < max_page:
                         zero_based.append(z)
                 zero_based = sorted(set(zero_based))
-                logger.info(f"Final subtopic pages (0-based): {zero_based}")
-                # If empty => entire doc
-                if not zero_based:
-                    subset_pdf_bytes = open(pdf_path, "rb").read()
                 else:
-                    original_bytes = open(pdf_path, "rb").read()
-                    subset_pdf_bytes = create_subset_pdf(original_bytes, zero_based)
-            # Step 3) doc_analyze => images => final MD
             dataset = PymuDocDataset(subset_pdf_bytes)
             inference = doc_analyze(
                 dataset,
@@ -509,30 +514,45 @@ class MineruPipelineForSubtopics:
             )
             logger.info("doc_analyze complete. Extracting images...")
-            writer = LocalImageWriter(self.output_folder)
-            pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
             md_content = pipe_result.get_markdown("local-unique-prefix/")
-            final_markdown = writer.post_process("local-unique-prefix/", md_content)
             md_path = os.path.join(self.output_folder, "final_output.md")
             with open(md_path, "w", encoding="utf-8") as f:
                 f.write(final_markdown)
             logger.info(f"Markdown saved to: {md_path}")
             return final_markdown
         finally:
             self.cleanup_gpu()
-# -------------------------------------------------------------------
-# Example usage
-# -------------------------------------------------------------------
 if __name__ == "__main__":
     input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
     output_dir = "/home/user/app/input_output/outputed"
-    processor = MineruPipelineForSubtopics(output_folder=output_dir)
-    final_md = processor.process(input_pdf)
-    print("\n===== FINAL .MD =====\n")
-    # print(final_md)

 import json
 import logging
 import fitz  # PyMuPDF (pip install pymupdf)
 import base64
 import concurrent.futures
+from io import BytesIO
 from typing import List, Dict, Any
+from google import genai
+from google.genai import types
 import torch
 import cv2
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from table_row_extraction import TableExtractor
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
+def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
+    """
+    Using PyMuPDF, create a new PDF that contains only the pages in page_indices (0-based).
+    Return the resulting PDF as bytes.
+    """
+    if not page_indices:
+        return original_pdf_bytes  # If empty, just return original
+    doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
+    new_doc = fitz.open()  # empty PDF to insert pages into
+    sorted_pages = sorted(set(page_indices))
+    for p in sorted_pages:
+        if 0 <= p < doc.page_count:
+            new_doc.insert_pdf(doc, from_page=p, to_page=p)
+        else:
+            logger.warning(f"Page index {p} is out of range, skipping.")
+    subset_bytes = new_doc.tobytes()
+    new_doc.close()
+    doc.close()
+    return subset_bytes
+class GeminiTopicExtractor:
+    """
+    Uses Gemini to parse the PDF text, looking specifically for
+    "2 Subject content and assessment information" and subtopics with pages.
+    """
+    def __init__(self, api_key: str = None):
+        self.api_key = api_key or os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
+    def extract_subtopics(self, pdf_path: str) -> Dict[str, Any]:
         """
+        1) Read entire PDF text
+        2) Ask Gemini for JSON structure like:
+           {
+             "2 Subject content and assessment information": {
+                "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
+                "Paper 3: Statistics and Mechanics": [30, 42]
+             }
+           }
+        3) Return parsed JSON
         """
+        text_content = self._read_entire_pdf(pdf_path)
+        if not text_content.strip():
+            logger.warning("No text extracted from PDF. Returning empty JSON.")
+            return {}
         prompt = f"""
+ You will be provided with the first pages of an exam board document. Your goal is to extract
+                the main subject-related topics from the "Contents" section and structure them in a valid JSON format.
+                Instructions:
+                1. Identify the "Contents" section, which lists all topics, subtopics, and their corresponding pages.
+                2. Extract only the **highest-level, subject-related subtopics** (ignore organizational or administrative sections).
+                3. For subtopics, include the full range of pages from the first to the last subtopic.
+                4. Return the output in the following JSON format:
+                    {{
+                        "topic_name": [start_page, end_page]
+                    }}
+                Important Notes:
+                - Ignore non-subject-related sections (e.g., "Introduction", "Exam Guidelines", "Appendices", "Assessment, Qualification at a glance").
+                - The extracted subtopics should represent major academic areas, not organizational or structural elements.
+                - Make sure that all of the pages for a subtopic are included, end page should be the start page of the topic
+                  that comes next after the extracted one in contents section.
+                Examples:
+                1. Given this table of contents:
+                    1 Introduction – 2
+                        Why choose Edexcel A Level Mathematics? - 2
+                        Supporting you in planning and implementing this qualification - 3
+                        Qualification at a glance - 5
+                    2 Subject content and assessment information – 7
+                        Paper 1 and Paper 2: Pure Mathematics - 11
+                        Paper 3: Statistics and Mechanics - 30
+                        Assessment Objectives - 40
+                    3 Administration and general information – 42
+                        Entries - 42
+                        Access arrangements, reasonable adjustments, special consideration and malpractice - 42
+                        Student recruitment and progression - 45
+                    Appendix 1: Formulae – 49
+                    Appendix 2: Notation – 53
+                    Appendix 3: Use of calculators – 59
+                    Appendix 4: Assessment Objectives – 60
+                    Appendix 5: The context for the development of this qualification – 62
+                    Appendix 6: Transferable skills – 64
+                    Appendix 7: Level 3 Extended Project qualification – 65
+                    Appendix 8: Codes – 67
+                   The correct output should be:
+                    {{
+                        "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
+                        "Paper 3: Statistics and Mechanics": [30, 42]
+                    }}
+                2. Given this table of contents:
+                    Qualification at a glance – 1
+                        Assessment Objectives and weightings - 4
+                    Knowledge, skills and understanding – 5
+                        Theme 1: Introduction to markets and market failure - 5
+                        Theme 2: The UK economy – performance and policies - 11
+                        Theme 3: Business behaviour and the labour market - 21
+                        Theme 4: A global perspective - 29
+                    Assessment – 39
+                        Assessment summary - 39
+                        Assessment objectives - 41
+                        Assessment overview - 42
+                        Breakdown of assessment objectives - 42
+                            Synoptic assessment - 43
+                            Discount code and performance tables - 43
+                            Access arrangements, reasonable adjustments and special consideration - 44
+                            Malpractice - 45
+                            Equality Act 2010 and Pearson equality policy - 45
+                            Synoptic assessment - 46
+                            Awarding and reporting - 47
+                    Other information – 49
+                        Student recruitment -49
+                        Prior learning and other requirements -49
+                        Progression - 49
+                    Appendix 1: Transferable skills – 53
+                    Appendix 2: Level 3 Extended Project qualification – 55
+                    Appendix 3: Quantitative skills – 59
+                    Appendix 4: Codes – 61
+                    Appendix 5: Index – 63
+                   The correct output should be:
+                    {{
+                        "Theme 1: Introduction to markets and market failure": [5, 10]
+                        "Theme 2: The UK economy – performance and policies": - [11, 20]
+                        "Theme 3: Business behaviour and the labour market": [21, 28]
+                        "Theme 4: A global perspective": [29, 38]
+                    }}
+                    Now, extract topics from this text: {text_content}
         """
         try:
+            client = genai.Client(api_key=self.api_key)
+            response = client.models.generate_content(
+                model="gemini-2.0-flash",
                 contents=[prompt],
+                config=types.GenerateContentConfig(temperature=0.0)
             )
+            raw_text = response.text.strip() if response and response.text else "{}"
+            # Clean up any triple backticks
+            cleaned = raw_text.replace("```json", "").replace("```", "")
+            data = json.loads(cleaned)
+            return data
         except Exception as e:
+            logger.error(f"Error from Gemini subtopic extraction: {e}")
+            return {}
+    def _read_entire_pdf(self, pdf_path: str) -> str:
+        """
+        Return the entire PDF text by concatenating all pages.
+        """
+        text_parts = []
+        try:
+            doc = fitz.open(pdf_path)
+            for p in range(doc.page_count):
+                page_text = doc.load_page(p).get_text()
+                text_parts.append(page_text)
+            doc.close()
+        except Exception as e:
+            logger.error(f"Could not open/read PDF: {e}")
+        return "\n".join(text_parts)
 def call_gemini_for_table_classification(image_data: bytes) -> str:
+    prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
+The three-column 'table' image include such key features:
+    - Three columns header columns
+    - Headers like 'Topics', 'Content', 'Guidelines'
+    - Numbered sections (e.g., 8.4, 9.1)
+    - Educational curriculum-style structure
+The two-column 'table' image include such key features:
+    - Two columns header columns
+    - Headers like 'Subject content' and 'Additional information'
+    - Numbered sections (e.g., 2.1, 3.4)
+    - Educational curriculum-style structure
+    - Bullet description in 'Additional information'
+If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
+If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
+If the image does not show a table at all, respond with 'NO_TABLE'.
+Return only one of these exact labels as your entire response:
+TWO_COLUMN
+THREE_COLUMN
+NO_TABLE
+"""
     try:
+        client = genai.Client(api_key=os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU"))
         response = client.models.generate_content(
             model="gemini-2.0-flash",
             contents=[
                         {
                             "inline_data": {
                                 "mime_type": "image/jpeg",
+                                "data": base64.b64encode(image_data).decode('utf-8')
                             }
                         }
                     ]
             ],
             config=types.GenerateContentConfig(temperature=0.0)
         )
+        classification = response.text.strip() if (response and response.text) else "NO_TABLE"
+        classification = classification.upper()
+        if "THREE" in classification:
             return "THREE_COLUMN"
+        elif "TWO" in classification:
             return "TWO_COLUMN"
         else:
             return "NO_TABLE"
     except Exception as e:
+        logger.error(f"Table classification error: {e}")
         return "NO_TABLE"
+# -------------------------------------------------------------------
+# Gemini-based image description (Mineru style)
+# -------------------------------------------------------------------
 def call_gemini_for_image_description(image_data: bytes) -> str:
+    prompt_text = """The provided image is a part of a question paper or markscheme.
+Extract all the necessary information from the image to be able to identify the question.
+To identify the question, we only need the following: question number and question part.
+Don't include redundant information.
+For example, if image contains text like: "Q1 Part A Answer: Life on earth was created by diety..."
+you should return just "Q1 Part A Mark Scheme"
+If there is no text on this image, return the description of the image. 20 words max.
+If there are not enough data, consider information from the surrounding context.
+Additionally, if the image contains a truncated part, you must describe it and mark as a
+part of some another image that goes before or after current image.
+If the image is of a multiple-choice question’s options, then modify your answer by appending
+'MCQ: A [option] B [option] C [option] D [option]' (replacing [option] with the actual options).
+Otherwise, follow the above instructions strictly.
+"""
     try:
+        client = genai.Client(api_key=os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU"))
         response = client.models.generate_content(
             model="gemini-2.0-flash",
             contents=[
                         {
                             "inline_data": {
                                 "mime_type": "image/jpeg",
+                                "data": base64.b64encode(image_data).decode('utf-8')
                             }
                         }
                     ]
             ],
             config=types.GenerateContentConfig(temperature=0.0)
         )
+        return response.text.strip() if response and response.text else "Image description unavailable"
     except Exception as e:
+        logger.error(f"Gemini image description error: {e}")
         return "Image description unavailable"
 class LocalImageWriter:
     """
+    Only writes images, does table classification, then modifies final MD
+    so that we keep only table references. We do not keep any text lines.
     """
     def __init__(self, output_folder: str):
         self.output_folder = output_folder
         self._img_count += 1
         local_filename = f"img_{self._img_count}.png"
         local_path = os.path.join(self.images_dir, local_filename)
         with open(local_path, "wb") as f:
             f.write(data)
     def post_process(self, key: str, md_content: str) -> str:
         # 1) Table classification
+        with concurrent.futures.ThreadPoolExecutor(max_workers=len(self.descriptions)) as executor:
             fut_map = {
+                executor.submit(call_gemini_for_table_classification, info["data"]): p
                 for p, info in self.descriptions.items()
             }
             for fut in concurrent.futures.as_completed(fut_map):
                     classification = fut.result()
                     self.descriptions[path]['table_classification'] = classification
                 except Exception as e:
+                    logger.error(f"[Gemini Table Classification Error for {path}]: {e}")
                     self.descriptions[path]['table_classification'] = "NO_TABLE"
+        # 2) If NO_TABLE => normal gemini-based description
+        with concurrent.futures.ThreadPoolExecutor(max_workers=len(self.descriptions)) as executor:
             fut_map2 = {}
             for p, info in self.descriptions.items():
                 if info['table_classification'] == "NO_TABLE":
+                    fut = executor.submit(call_gemini_for_image_description, info['data'])
                     fut_map2[fut] = p
             for fut in concurrent.futures.as_completed(fut_map2):
                     desc = fut.result()
                     self.descriptions[path]['final_alt'] = desc
                 except Exception as e:
+                    logger.error(f"[Gemini Desc Error for {path}]: {e}")
                     self.descriptions[path]['final_alt'] = "Image description unavailable"
+        # 3) If 2/3-col => "HAS TO BE PROCESSED"
         for p, info in self.descriptions.items():
             cls = info['table_classification']
             if cls == "TWO_COLUMN":
         # 4) Replace placeholders
         for p, info in self.descriptions.items():
+            old_md = f"![]({key}{p})"
+            new_md = f"![{info['final_alt']}]({info['relative_path']})"
+            md_content = md_content.replace(old_md, new_md)
+        # 5) For "HAS TO BE PROCESSED" => run TableExtractor => replace single line with row/cell lines
         md_content = self._process_table_images_in_markdown(md_content)
+        # 6) **Remove all text** => keep only lines that are image references
         final_lines = []
         for line in md_content.split("\n"):
+            # We only keep lines that start with "!" or have "!["
+            # (i.e. lines referencing images)
+            if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
+                final_lines.append(line.strip())
+        new_md = "\n".join(final_lines)
+        return new_md
     def _process_table_images_in_markdown(self, md_content: str) -> str:
         pattern = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
             try:
                 if col_type.lower() == 'two':
                     extractor = TableExtractor(
+                        skip_header=True,
                         merge_two_col_rows=True,
                         enable_subtopic_merge=True,
                         subtopic_threshold=0.2
                     )
                 else:
                     extractor = TableExtractor(
+                        skip_header=True,
                         merge_two_col_rows=False,
+                        enable_subtopic_merge=False,
+                        subtopic_threshold=0.2
                     )
                 row_boxes = extractor.process_image(abs_image_path)
                 os.makedirs(out_folder, exist_ok=True)
                 extractor.save_extracted_cells(abs_image_path, row_boxes, out_folder)
+                # Build snippet
                 snippet_lines = ["**Extracted table cells:**"]
                 for i, row in enumerate(row_boxes):
                     row_dir = os.path.join(out_folder, f"row_{i}")
                 new_snippet = "\n".join(snippet_lines)
                 old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_path})"
                 md_content = md_content.replace(old_line, new_snippet)
             except Exception as e:
                 logger.error(f"Error processing table image {image_path}: {e}")
         return md_content
class MineruNoTextProcessor:
    """
    Drive the PDF -> Markdown pipeline for one output folder.

    `process` asks the subtopic extractor for page ranges, optionally cuts the
    PDF down to those pages, runs `doc_analyze` and the OCR pipe on the result,
    lets `LocalImageWriter.post_process` rewrite the Markdown, and saves it as
    `final_output.md` inside `output_folder`.
    """

    # Key of the extractor result whose page ranges select the pages to keep.
    _SECTION_KEY = "2 Subject content and assessment information"
    # Prefix passed to get_markdown() and post_process() so that the image
    # placeholders emitted by the former can be located by the latter.
    _MD_IMAGE_PREFIX = "local-unique-prefix/"

    def __init__(self, output_folder: str):
        """
        Create the output folder if needed and set up the subtopic extractor.

        Args:
            output_folder: Directory that receives `final_output.md`; it is
                also handed to `LocalImageWriter`.
        """
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)
        self.table_enable = False
        self.language = "en"
        self.subtopic_extractor = GeminiTopicExtractor()

    def cleanup_gpu(self):
        """Empty the CUDA cache; a failure is logged and never raised."""
        try:
            torch.cuda.empty_cache()
            logger.info("GPU memory cleaned up.")
        except Exception as e:
            logger.error(f"Error during GPU cleanup: {e}")

    def process(self, pdf_path: str) -> str:
        """
        Convert one PDF to the final Markdown and write it to disk.

        Steps:
        1) Extract the subtopics JSON from the PDF.
        2) Flatten the page ranges found under the subject-content section.
        3) Build a subset PDF from those pages (the entire document is used
           when no usable page is found).
        4) Run doc_analyze and the OCR pipe, then post-process the Markdown
           through the image writer.
        5) Save the result as `final_output.md` and return it.

        Args:
            pdf_path: Path of the PDF to process.

        Returns:
            The post-processed Markdown text.

        Raises:
            Any exception raised by the steps above propagates to the caller;
            `cleanup_gpu` still runs first.
        """
        logger.info(f"Processing PDF: {pdf_path}")
        try:
            # 1) Extract subtopics
            data = self.subtopic_extractor.extract_subtopics(pdf_path)
            # Only a mapping can be indexed by the section key; anything else
            # (empty result, list, string) falls back to the entire document.
            if not isinstance(data, dict) or self._SECTION_KEY not in data:
                logger.warning("Gemini did not return '2 Subject content...' or data is empty.")
                page_indices = []
            else:
                # 2) Flatten pages
                page_indices = self._collect_page_indices(data[self._SECTION_KEY])

            with open(pdf_path, "rb") as f:
                original_pdf_bytes = f.read()

            # Default to the entire document; narrowed below when possible.
            subset_pdf_bytes = original_pdf_bytes
            if page_indices:
                with fitz.open(stream=original_pdf_bytes, filetype="pdf") as doc:
                    max_p = doc.page_count
                # Convert from 1-based => 0-based, dropping out-of-range pages
                # and duplicates.
                zero_based = sorted({p - 1 for p in page_indices if 1 <= p <= max_p})
                if zero_based:
                    logger.info(f"Subtopic pages (0-based): {zero_based}")
                    subset_pdf_bytes = create_subset_pdf(original_pdf_bytes, zero_based)
                else:
                    logger.warning("No valid subtopic pages, using entire doc.")

            # 3) doc_analyze with subset
            dataset = PymuDocDataset(subset_pdf_bytes)
            # NOTE(review): `dataset` is the only argument visible at this call
            # site; confirm against the full file that no keyword arguments
            # belong here.
            inference = doc_analyze(
                dataset,
            )
            logger.info("doc_analyze complete. Extracting images...")

            # 4) Only images => table classification => final MD
            image_writer = LocalImageWriter(self.output_folder)
            pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
            md_content = pipe_result.get_markdown(self._MD_IMAGE_PREFIX)
            final_markdown = image_writer.post_process(self._MD_IMAGE_PREFIX, md_content)

            # 5) Save final
            md_path = os.path.join(self.output_folder, "final_output.md")
            with open(md_path, "w", encoding="utf-8") as f:
                f.write(final_markdown)
            logger.info(f"Markdown saved to: {md_path}")
            return final_markdown
        finally:
            self.cleanup_gpu()

    def _collect_page_indices(self, subtopic_dict: Dict[str, List[int]]) -> List[int]:
        """
        Flatten {subtopic: [start, end]} page ranges into a list of pages.

        Given something like:
        {
          "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
          "Paper 3: Statistics and Mechanics": [30, 42]
        }
        Return [11..29, 30..42] => a flattened list of pages (both ends
        inclusive, in the order the ranges appear).

        The input comes from parsed model output, so malformed entries are
        skipped instead of raising: a value that is not a two-item list, or
        whose bounds are not integers, contributes no pages. A range whose
        start is greater than its end is empty. A non-dict argument yields [].
        """
        if not isinstance(subtopic_dict, dict):
            return []

        pages: List[int] = []
        for rng in subtopic_dict.values():
            if not (isinstance(rng, list) and len(rng) == 2):
                continue
            # bool is a subclass of int, but it is never a page number.
            if not all(isinstance(b, int) and not isinstance(b, bool) for b in rng):
                continue
            start_p, end_p = rng
            # add all pages from start to end (inclusive)
            pages.extend(range(start_p, end_p + 1))
        return pages
 if __name__ == "__main__":
     input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
     output_dir = "/home/user/app/input_output/outputed"
+    processor = MineruNoTextProcessor(output_folder=output_dir)
+    final_md = processor.process(input_pdf)