Correct page range handling

topic_extraction.py (+8 −75)
@@ -14,41 +14,24 @@ import torch
 import cv2
 import numpy as np
 
-
-try:
-    from google import genai
-    from google.genai import types
-except ImportError:
-    genai = None
-    types = None
-
-# magic-pdf imports
+from google import genai
+from google.genai import types
+
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 
-# table extraction logic
 from table_row_extraction import TableExtractor
 
-###############################################################################
-# Logging Setup
-###############################################################################
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
-###############################################################################
-# PDF Utility Functions
-###############################################################################
 def unify_whitespace(text: str) -> str:
-    """
-    Replace runs of whitespace with a single space, strip leading/trailing, then lowercase.
-    """
     return re.sub(r"\s+", " ", text).strip().lower()
 
 def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
     """
-    Creates a new PDF (in memory) containing only pages
-    Raises ValueError if page_indices is empty or out of range.
+    Creates a new PDF (in memory) containing only pages from page_indices (0-based).
     """
     if not page_indices:
         raise ValueError("No page indices provided for subset creation.")
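For reference, the in-memory subset step can be done with PyMuPDF's `insert_pdf`. A minimal sketch of what `create_subset_pdf` plausibly does under that assumption (its body sits outside the visible hunks):

```python
import fitz  # PyMuPDF

def create_subset_pdf_sketch(original_pdf_bytes: bytes, page_indices: list) -> bytes:
    # Open the source document directly from memory.
    src = fitz.open(stream=original_pdf_bytes, filetype="pdf")
    if any(i < 0 or i >= src.page_count for i in page_indices):
        raise ValueError("Page index out of range.")
    # Copy the selected 0-based pages into a fresh in-memory document.
    out = fitz.open()
    for i in page_indices:
        out.insert_pdf(src, from_page=i, to_page=i)
    subset_bytes = out.tobytes()
    out.close()
    src.close()
    return subset_bytes
```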
@@ -66,9 +49,6 @@ def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
     doc.close()
     return subset_bytes
 
-###############################################################################
-# Searching in PDF
-###############################################################################
 def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
     """
     Return a sorted list of 0-based pages in which `search_text` (normalized) appears,
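The search builds on `unify_whitespace` so that layout whitespace and casing do not break matches. A hedged sketch of the approach, assuming PyMuPDF's `page.get_text()` supplies the page text:

```python
import fitz  # PyMuPDF

def find_all_occurrences_sketch(pdf_bytes: bytes, search_text: str) -> list:
    needle = unify_whitespace(search_text)  # normalized, lowercased
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    found = set()
    for i, page in enumerate(doc):
        # Normalize page text the same way before substring matching.
        if needle in unify_whitespace(page.get_text()):
            found.add(i)
    doc.close()
    return sorted(found)
```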
@@ -85,46 +65,20 @@ def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
     doc.close()
     return sorted(found)
 
-###############################################################################
-# Gemini LLM for Subtopic Extraction
-###############################################################################
 class GeminiTopicExtractor:
-    """
-    Extract subtopics from the PDF by reading the first `num_pages` pages, calling Gemini.
-    We expect a structure like:
-    {
-      "2 Subject content and assessment information": {
-         "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
-         "Paper 3: Statistics and Mechanics": [30, 42]
-      }
-    }
-    or sometimes just a flat dict:
-    {
-      "Paper 1 and Paper 2: Pure Mathematics": [15, 33],
-      "Paper 3: Statistics and Mechanics": [34, 46]
-    }
-    We'll parse both forms.
-    """
     def __init__(self, api_key: str = None, num_pages: int = 10):
         self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
-        if not self.api_key:
-            logger.warning("No Gemini API key for subtopic extraction.")
         self.num_pages = num_pages
 
     def extract_subtopics(self, pdf_path: str) -> Dict[str, List[int]]:
         """
         Return a dict of subtopics => [start_page, end_page].
-        Could be empty if parsing fails or the LLM can't find subtopics.
         """
         first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
         if not first_pages_text.strip():
             logger.error("No text from first pages => cannot extract subtopics.")
             return {}
 
-        if genai is None or types is None:
-            logger.warning("google.genai not installed. Returning empty subtopics.")
-            return {}
-
         prompt = f"""
 You have the first pages of a PDF specification, including a table of contents.
 
@@ -229,15 +183,14 @@ Now, extract topics from this text:
             return {}
 
         raw_json = response.text.strip()
-        # Clean up triple backticks
         cleaned = raw_json.replace("```json", "").replace("```", "")
 
         # Attempt to parse
         data = json.loads(cleaned)
         # data might be nested or flat
-        # if nested,
-        # if flat,
-        #
+        # if nested, example {"2 Subject content": {"Paper 1...": [11,29]}}
+        # if flat, example {"Paper 1...": [11,29]}
+        # so we unify it to a single dict of subname => [start,end].
         final_dict = {}
 
         # If the top-level is a dict of dict
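The restored comments summarize the normalization. A small sketch of the nested-vs-flat unification they describe (an illustration; only fragments of the real parsing loop appear in these hunks):

```python
def unify_subtopics_sketch(data: dict) -> dict:
    # Flat form:   {"Paper 1 ...": [11, 29]} -- values are [start, end] pairs.
    # Nested form: {"2 Subject content ...": {"Paper 1 ...": [11, 29]}}.
    final_dict = {}
    for key, value in data.items():
        if isinstance(value, dict):
            # Nested: pull the ranges out of the inner dict.
            for subk, rng in value.items():
                if isinstance(rng, list) and len(rng) == 2:
                    final_dict[subk] = rng
        elif isinstance(value, list) and len(value) == 2:
            # Already flat.
            final_dict[key] = value
    return final_dict
```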
@@ -254,7 +207,6 @@ Now, extract topics from this text:
                     break
 
         if found_sub_dict is not None:
-            # parse found_sub_dict
             for subk, rng in found_sub_dict.items():
                 if isinstance(rng, list) and len(rng) == 2:
                     final_dict[subk] = rng
@@ -283,12 +235,9 @@ Now, extract topics from this text:
             logger.error(f"Could not open PDF: {e}")
         return "\n".join(text_parts)
 
-###############################################################################
-# Concurrency for Table Classification
-###############################################################################
 def call_gemini_for_table_classification(image_data: bytes, api_key: str) -> str:
     """
-    Classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE
+    Classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE
     """
     if not api_key:
         logger.warning("No Gemini API key => NO_TABLE.")
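With `google.genai` now imported unconditionally, a classification call might look like the sketch below. The model name and prompt are placeholders, not values taken from this file:

```python
from google import genai
from google.genai import types

def classify_table_sketch(image_data: bytes, api_key: str) -> str:
    client = genai.Client(api_key=api_key)
    prompt = "Reply with exactly one of: TWO_COLUMN, THREE_COLUMN, NO_TABLE."
    response = client.models.generate_content(
        model="gemini-2.0-flash",  # placeholder model name
        contents=[
            types.Part.from_bytes(data=image_data, mime_type="image/png"),
            prompt,
        ],
    )
    answer = response.text.strip().upper()
    # Fall back to NO_TABLE on anything unexpected.
    return answer if answer in {"TWO_COLUMN", "THREE_COLUMN", "NO_TABLE"} else "NO_TABLE"
```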
@@ -354,9 +303,6 @@ NO_TABLE
         logger.error(f"Gemini table classification error: {e}")
         return "NO_TABLE"
 
-###############################################################################
-# LocalImageWriter
-###############################################################################
 class LocalImageWriter:
     """
     Writes extracted images, then does concurrency-based table classification calls.
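The docstring mentions concurrency-based classification calls, but the executor code sits outside the visible hunks; one plausible shape, assuming a thread pool over `call_gemini_for_table_classification`:

```python
from concurrent.futures import ThreadPoolExecutor

def classify_all_sketch(images: dict, api_key: str, max_workers: int = 8) -> dict:
    # images: {relative_path: png_bytes}. The Gemini calls are I/O bound,
    # so threads are a reasonable fit for issuing them in parallel.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            path: pool.submit(call_gemini_for_table_classification, data, api_key)
            for path, data in images.items()
        }
        return {path: fut.result() for path, fut in futures.items()}
```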
@@ -476,17 +422,7 @@ class LocalImageWriter:
 
         return md_content
 
-###############################################################################
-# MineruNoTextProcessor
-###############################################################################
 class MineruNoTextProcessor:
-    """
-    1) Use Gemini to get subtopics => e.g. {"Paper 1 and Paper 2: Pure Mathematics": [11,29], ...}
-    2) For each subtopic name => find real occurrence in PDF at or after (start_page-1).
-    3) offset = occurrence_page - (start_page-1). clamp offset >= 0
-    4) Flatten final pages, subset PDF, run magic-pdf => concurrency => final MD
-    5) If no subtopics found, process entire PDF as fallback.
-    """
     def __init__(self, output_folder: str, gemini_api_key: str = None):
         self.output_folder = output_folder
         os.makedirs(self.output_folder, exist_ok=True)
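The removed docstring still documents the page-range logic this commit is correcting: find the heading's real occurrence at or after the TOC's 1-based start page, then shift the whole range by a clamped offset. A worked sketch of steps 2-3 under that reading (a hypothetical helper; the updated method body is not in the visible hunks):

```python
def resolve_page_range_sketch(pdf_bytes: bytes, name: str,
                              start_page: int, end_page: int) -> list:
    # TOC page numbers are 1-based; convert to a 0-based guess.
    guess = start_page - 1
    # Find where the subtopic heading actually occurs at or after the guess.
    occurrences = [p for p in find_all_occurrences(pdf_bytes, name) if p >= guess]
    # offset = occurrence_page - (start_page - 1), clamped at zero.
    offset = max(0, occurrences[0] - guess) if occurrences else 0
    # Shift the whole 0-based range by the clamped offset.
    return list(range(guess + offset, (end_page - 1) + offset + 1))
```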
@@ -604,9 +540,6 @@ class MineruNoTextProcessor:
         finally:
             self.cleanup_gpu()
 
-###############################################################################
-# Example Main
-###############################################################################
 if __name__ == "__main__":
     input_pdf = "/home/user/app/input_output/ocr-specification-economics.pdf"
     output_dir = "/home/user/app/outputs"