SkyNait committed on
Commit
da9ad0b
·
verified ·
1 Parent(s): ee099e1

add comments for understanding

Browse files
Files changed (1) hide show
  1. topic_extraction.py +41 -66
topic_extraction.py CHANGED
@@ -26,16 +26,15 @@ logging.basicConfig(level=logging.INFO)
26
  logger = logging.getLogger(__name__)
27
  logger.setLevel(logging.INFO)
28
 
 
 
 
29
  def unify_whitespace(text: str) -> str:
30
  return re.sub(r"\s+", " ", text).strip().lower()
31
 
32
  def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
33
- """
34
- Creates a new PDF (in memory) containing only pages from page_indices (0-based).
35
- """
36
  if not page_indices:
37
  raise ValueError("No page indices provided for subset creation.")
38
-
39
  doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
40
  new_doc = fitz.open()
41
  for p in sorted(set(page_indices)):
@@ -78,7 +77,6 @@ class GeminiTopicExtractor:
78
  if not first_pages_text.strip():
79
  logger.error("No text from first pages => cannot extract subtopics.")
80
  return {}
81
-
82
  prompt = f"""
83
  You have the first pages of a PDF specification, including a table of contents.
84
 
@@ -162,17 +160,20 @@ Appendix 5: Index – 63
162
  The correct output should be:
163
 
164
  {{
165
- "Theme 1: Introduction to markets and market failure": [5, 10]
166
- "Theme 2: The UK economy – performance and policies": [11, 20]
167
- "Theme 3: Business behaviour and the labour market": [21, 28]
168
  "Theme 4: A global perspective": [29, 38]
169
  }}
170
 
171
  Now, extract topics from this text:
172
  {first_pages_text}
173
  """
 
 
 
 
174
  try:
175
- client = genai.Client(api_key=self.api_key)
176
  response = client.models.generate_content(
177
  model="gemini-2.0-flash",
178
  contents=[prompt],
@@ -181,7 +182,7 @@ Now, extract topics from this text:
181
  if not response or not response.text:
182
  logger.warning("No text from LLM => returning empty subtopics.")
183
  return {}
184
-
185
  raw_json = response.text.strip()
186
  cleaned = raw_json.replace("```json", "").replace("```", "")
187
 
@@ -205,7 +206,6 @@ Now, extract topics from this text:
205
  # might be the sub-sub dict
206
  found_sub_dict = v
207
  break
208
-
209
  if found_sub_dict is not None:
210
  for subk, rng in found_sub_dict.items():
211
  if isinstance(rng, list) and len(rng) == 2:
@@ -216,7 +216,6 @@ Now, extract topics from this text:
216
  for subk, rng in data.items():
217
  if isinstance(rng, list) and len(rng) == 2:
218
  final_dict[subk] = rng
219
-
220
  return final_dict
221
  except Exception as e:
222
  logger.error(f"Gemini subtopic extraction error: {e}")
@@ -239,14 +238,7 @@ def call_gemini_for_table_classification(image_data: bytes, api_key: str) -> str
239
  """
240
  Classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE
241
  """
242
- if not api_key:
243
- logger.warning("No Gemini API key => NO_TABLE.")
244
- return "NO_TABLE"
245
- if genai is None or types is None:
246
- logger.warning("google.genai not installed => NO_TABLE.")
247
- return "NO_TABLE"
248
-
249
- # Attempt to shrink
250
  try:
251
  arr = np.frombuffer(image_data, np.uint8)
252
  img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
@@ -266,7 +258,7 @@ def call_gemini_for_table_classification(image_data: bytes, api_key: str) -> str
266
  image_data = enc.tobytes()
267
  except Exception as e:
268
  logger.warning(f"shrink_image_to_jpeg error: {e}")
269
-
270
  prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
271
  The three-column 'table' image include such key features:
272
  - Three columns header columns
@@ -287,8 +279,11 @@ TWO_COLUMN
287
  THREE_COLUMN
288
  NO_TABLE
289
  """
 
 
 
 
290
  try:
291
- client = genai.Client(api_key=api_key)
292
  resp = client.models.generate_content(
293
  model="gemini-2.0-flash",
294
  contents=[
@@ -324,10 +319,8 @@ class LocalImageWriter:
324
  def __init__(self, output_folder: str, gemini_api_key: str):
325
  self.output_folder = output_folder
326
  os.makedirs(self.output_folder, exist_ok=True)
327
-
328
  self.images_dir = os.path.join(self.output_folder, "images")
329
  os.makedirs(self.images_dir, exist_ok=True)
330
-
331
  self.descriptions = {}
332
  self._img_count = 0
333
  self.gemini_api_key = gemini_api_key
@@ -349,11 +342,7 @@ class LocalImageWriter:
349
  def post_process(self, key: str, md_content: str) -> str:
350
  logger.info("Classifying images to detect tables (concurrent)...")
351
  with concurrent.futures.ThreadPoolExecutor(max_workers=6) as exe:
352
- fut_map = {}
353
- for p, info in self.descriptions.items():
354
- fut = exe.submit(call_gemini_for_table_classification, info["data"], self.gemini_api_key)
355
- fut_map[fut] = p
356
-
357
  for fut in concurrent.futures.as_completed(fut_map):
358
  path = fut_map[fut]
359
  try:
@@ -362,8 +351,7 @@ class LocalImageWriter:
362
  except Exception as e:
363
  logger.error(f"Table classification error: {e}")
364
  self.descriptions[path]['table_classification'] = "NO_TABLE"
365
-
366
- # 2) Set final alt text
367
  for p, info in self.descriptions.items():
368
  cls = info['table_classification']
369
  if cls == "TWO_COLUMN":
@@ -372,22 +360,21 @@ class LocalImageWriter:
372
  info['final_alt'] = "HAS TO BE PROCESSED - three column table"
373
  else:
374
  info['final_alt'] = "NO_TABLE image"
375
-
376
- # 3) Replace placeholders in the Markdown
377
  for p, info in self.descriptions.items():
378
  old_md = f"![]({key}{p})"
379
  new_md = f"![{info['final_alt']}]({info['relative_path']})"
380
  md_content = md_content.replace(old_md, new_md)
381
-
382
- # 4) If any table images => extract rows
383
  md_content = self._process_table_images_in_markdown(md_content)
384
-
385
- # 5) Keep only lines that are image references
386
  final_lines = []
387
  for line in md_content.split("\n"):
388
  if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
389
  final_lines.append(line.strip())
390
-
391
  return "\n".join(final_lines)
392
 
393
  def _process_table_images_in_markdown(self, md_content: str) -> str:
@@ -395,7 +382,7 @@ class LocalImageWriter:
395
  matches = re.findall(pat, md_content, flags=re.IGNORECASE)
396
  if not matches:
397
  return md_content
398
-
399
  for (col_type, image_path) in matches:
400
  logger.info(f"Processing table image => {image_path}, columns={col_type}")
401
  abs_image_path = os.path.join(self.output_folder, image_path)
@@ -418,7 +405,7 @@ class LocalImageWriter:
418
  out_folder = abs_image_path + "_rows"
419
  os.makedirs(out_folder, exist_ok=True)
420
  extractor.save_extracted_cells(abs_image_path, row_boxes, out_folder)
421
-
422
  snippet = ["**Extracted table cells:**"]
423
  for i, row in enumerate(row_boxes):
424
  row_dir = os.path.join(out_folder, f"row_{i}")
@@ -427,26 +414,22 @@ class LocalImageWriter:
427
  cell_path = os.path.join(row_dir, cell_file)
428
  relp = os.path.relpath(cell_path, self.output_folder)
429
  snippet.append(f"![Row {i} Col {j}]({relp})")
430
-
431
  new_snip = "\n".join(snippet)
432
  old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_path})"
433
  md_content = md_content.replace(old_line, new_snip)
434
  except Exception as e:
435
  logger.error(f"Error processing table image {image_path}: {e}")
436
-
437
  return md_content
438
 
439
  class MineruNoTextProcessor:
440
  def __init__(self, output_folder: str, gemini_api_key: str = None):
441
  self.output_folder = output_folder
442
  os.makedirs(self.output_folder, exist_ok=True)
443
-
444
  self.layout_model = "doclayout_yolo"
445
  self.formula_enable = True
446
  self.table_enable = False
447
  self.language = "en"
448
-
449
- # Use our new flexible approach
450
  self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=10)
451
  self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
452
 
@@ -459,19 +442,18 @@ class MineruNoTextProcessor:
459
  logger.error(f"Error during GPU cleanup: {e}")
460
 
461
  def process(self, pdf_path: str) -> str:
462
- logger.info(f"Processing PDF: {pdf_path}")
463
  try:
464
- # 1) Extract subtopics from Gemini
465
  subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
466
  logger.info(f"Gemini returned subtopics: {subtopics}")
467
-
468
- # 2) Read entire PDF
469
  with open(pdf_path, "rb") as f:
470
  pdf_bytes = f.read()
471
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
472
  total_pages = doc.page_count
473
  doc.close()
474
-
475
  final_pages = set()
476
  if not subtopics:
477
  logger.warning("No subtopics found. We'll process the entire PDF as fallback.")
@@ -490,7 +472,6 @@ class MineruNoTextProcessor:
490
  # find occurrences
491
  occs = find_all_occurrences(pdf_bytes, subname)
492
  logger.info(f"Occurrences of subtopic '{subname}': {occs}")
493
-
494
  doc_start_0 = start_p - 1
495
  chosen_page = None
496
  for p in occs:
@@ -505,27 +486,25 @@ class MineruNoTextProcessor:
505
  else:
506
  chosen_page = 0
507
  logger.warning(f"No occurrences for '{subname}'. Using page 0.")
508
-
509
  raw_offset = chosen_page - doc_start_0
510
  offset = max(0, raw_offset)
511
  logger.info(f"Subtopic '{subname}': doc_start={start_p}, chosen_page={chosen_page}, raw_offset={raw_offset}, offset={offset}")
512
-
513
  s0 = (start_p - 1) + offset
514
  e0 = (end_p - 1) + offset
515
  s0 = max(0, min(total_pages - 1, s0))
516
  e0 = max(0, min(total_pages - 1, e0))
517
  for pp in range(s0, e0 + 1):
518
  final_pages.add(pp)
519
-
520
- # 3) If final_pages is empty => fallback entire PDF
521
  if not final_pages:
522
  logger.warning("No valid pages after offset. We'll process entire PDF.")
523
  final_pages = set(range(total_pages))
524
-
525
  logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
526
  subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
527
-
528
- # 4) doc_analyze => concurrency => final MD
529
  dataset = PymuDocDataset(subset_pdf_bytes)
530
  inference = doc_analyze(
531
  dataset,
@@ -535,22 +514,19 @@ class MineruNoTextProcessor:
535
  formula_enable=self.formula_enable,
536
  table_enable=self.table_enable
537
  )
538
- logger.info("doc_analyze complete. Extracting images...")
539
-
540
  writer = LocalImageWriter(self.output_folder, self.gemini_api_key)
541
  pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
542
  md_content = pipe_result.get_markdown("local-unique-prefix/")
543
-
544
  final_markdown = writer.post_process("local-unique-prefix/", md_content)
545
-
546
- # 5) Save
547
  out_path = os.path.join(self.output_folder, "final_output.md")
548
  with open(out_path, "w", encoding="utf-8") as f:
549
  f.write(final_markdown)
550
-
551
  logger.info(f"Markdown saved to: {out_path}")
552
  return final_markdown
553
-
554
  finally:
555
  self.cleanup_gpu()
556
 
@@ -558,7 +534,6 @@ if __name__ == "__main__":
558
  input_pdf = "/home/user/app/input_output/ocr-specification-economics.pdf"
559
  output_dir = "/home/user/app/outputs"
560
  gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
561
-
562
  try:
563
  processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
564
  md_output = processor.process(input_pdf)
 
26
  logger = logging.getLogger(__name__)
27
  logger.setLevel(logging.INFO)
28
 
29
+ _GEMINI_CLIENT = None
30
+ # Cache a single Gemini client instance so it is created once and reused, instead of being reinitialized on every call
31
+
32
  def unify_whitespace(text: str) -> str:
33
  return re.sub(r"\s+", " ", text).strip().lower()
34
 
35
  def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
 
 
 
36
  if not page_indices:
37
  raise ValueError("No page indices provided for subset creation.")
 
38
  doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
39
  new_doc = fitz.open()
40
  for p in sorted(set(page_indices)):
 
77
  if not first_pages_text.strip():
78
  logger.error("No text from first pages => cannot extract subtopics.")
79
  return {}
 
80
  prompt = f"""
81
  You have the first pages of a PDF specification, including a table of contents.
82
 
 
160
  The correct output should be:
161
 
162
  {{
163
+ "Theme 1: Introduction to markets and market failure": [5, 10],
164
+ "Theme 2: The UK economy – performance and policies": [11, 20],
165
+ "Theme 3: Business behaviour and the labour market": [21, 28],
166
  "Theme 4: A global perspective": [29, 38]
167
  }}
168
 
169
  Now, extract topics from this text:
170
  {first_pages_text}
171
  """
172
+ global _GEMINI_CLIENT
173
+ if _GEMINI_CLIENT is None:
174
+ _GEMINI_CLIENT = genai.Client(api_key=self.api_key)
175
+ client = _GEMINI_CLIENT
176
  try:
 
177
  response = client.models.generate_content(
178
  model="gemini-2.0-flash",
179
  contents=[prompt],
 
182
  if not response or not response.text:
183
  logger.warning("No text from LLM => returning empty subtopics.")
184
  return {}
185
+
186
  raw_json = response.text.strip()
187
  cleaned = raw_json.replace("```json", "").replace("```", "")
188
 
 
206
  # might be the sub-sub dict
207
  found_sub_dict = v
208
  break
 
209
  if found_sub_dict is not None:
210
  for subk, rng in found_sub_dict.items():
211
  if isinstance(rng, list) and len(rng) == 2:
 
216
  for subk, rng in data.items():
217
  if isinstance(rng, list) and len(rng) == 2:
218
  final_dict[subk] = rng
 
219
  return final_dict
220
  except Exception as e:
221
  logger.error(f"Gemini subtopic extraction error: {e}")
 
238
  """
239
  Classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE
240
  """
241
+ # Shrink the image to reduce its payload size before classification
 
 
 
 
 
 
 
242
  try:
243
  arr = np.frombuffer(image_data, np.uint8)
244
  img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
 
258
  image_data = enc.tobytes()
259
  except Exception as e:
260
  logger.warning(f"shrink_image_to_jpeg error: {e}")
261
+
262
  prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
263
  The three-column 'table' image include such key features:
264
  - Three columns header columns
 
279
  THREE_COLUMN
280
  NO_TABLE
281
  """
282
+ global _GEMINI_CLIENT
283
+ if _GEMINI_CLIENT is None:
284
+ _GEMINI_CLIENT = genai.Client(api_key=api_key)
285
+ client = _GEMINI_CLIENT
286
  try:
 
287
  resp = client.models.generate_content(
288
  model="gemini-2.0-flash",
289
  contents=[
 
319
  def __init__(self, output_folder: str, gemini_api_key: str):
320
  self.output_folder = output_folder
321
  os.makedirs(self.output_folder, exist_ok=True)
 
322
  self.images_dir = os.path.join(self.output_folder, "images")
323
  os.makedirs(self.images_dir, exist_ok=True)
 
324
  self.descriptions = {}
325
  self._img_count = 0
326
  self.gemini_api_key = gemini_api_key
 
342
  def post_process(self, key: str, md_content: str) -> str:
343
  logger.info("Classifying images to detect tables (concurrent)...")
344
  with concurrent.futures.ThreadPoolExecutor(max_workers=6) as exe:
345
+ fut_map = {exe.submit(call_gemini_for_table_classification, info["data"], self.gemini_api_key): p for p, info in self.descriptions.items()}
 
 
 
 
346
  for fut in concurrent.futures.as_completed(fut_map):
347
  path = fut_map[fut]
348
  try:
 
351
  except Exception as e:
352
  logger.error(f"Table classification error: {e}")
353
  self.descriptions[path]['table_classification'] = "NO_TABLE"
354
+
 
355
  for p, info in self.descriptions.items():
356
  cls = info['table_classification']
357
  if cls == "TWO_COLUMN":
 
360
  info['final_alt'] = "HAS TO BE PROCESSED - three column table"
361
  else:
362
  info['final_alt'] = "NO_TABLE image"
363
+
364
+ # Replace placeholders in the Markdown
365
  for p, info in self.descriptions.items():
366
  old_md = f"![]({key}{p})"
367
  new_md = f"![{info['final_alt']}]({info['relative_path']})"
368
  md_content = md_content.replace(old_md, new_md)
369
+
370
+ # If any table images => extract rows
371
  md_content = self._process_table_images_in_markdown(md_content)
372
+
373
+ # Keep only lines that are image references
374
  final_lines = []
375
  for line in md_content.split("\n"):
376
  if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
377
  final_lines.append(line.strip())
 
378
  return "\n".join(final_lines)
379
 
380
  def _process_table_images_in_markdown(self, md_content: str) -> str:
 
382
  matches = re.findall(pat, md_content, flags=re.IGNORECASE)
383
  if not matches:
384
  return md_content
385
+
386
  for (col_type, image_path) in matches:
387
  logger.info(f"Processing table image => {image_path}, columns={col_type}")
388
  abs_image_path = os.path.join(self.output_folder, image_path)
 
405
  out_folder = abs_image_path + "_rows"
406
  os.makedirs(out_folder, exist_ok=True)
407
  extractor.save_extracted_cells(abs_image_path, row_boxes, out_folder)
408
+
409
  snippet = ["**Extracted table cells:**"]
410
  for i, row in enumerate(row_boxes):
411
  row_dir = os.path.join(out_folder, f"row_{i}")
 
414
  cell_path = os.path.join(row_dir, cell_file)
415
  relp = os.path.relpath(cell_path, self.output_folder)
416
  snippet.append(f"![Row {i} Col {j}]({relp})")
417
+
418
  new_snip = "\n".join(snippet)
419
  old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_path})"
420
  md_content = md_content.replace(old_line, new_snip)
421
  except Exception as e:
422
  logger.error(f"Error processing table image {image_path}: {e}")
 
423
  return md_content
424
 
425
  class MineruNoTextProcessor:
426
  def __init__(self, output_folder: str, gemini_api_key: str = None):
427
  self.output_folder = output_folder
428
  os.makedirs(self.output_folder, exist_ok=True)
 
429
  self.layout_model = "doclayout_yolo"
430
  self.formula_enable = True
431
  self.table_enable = False
432
  self.language = "en"
 
 
433
  self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=10)
434
  self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
435
 
 
442
  logger.error(f"Error during GPU cleanup: {e}")
443
 
444
  def process(self, pdf_path: str) -> str:
 
445
  try:
446
+ # Extract subtopics from Gemini
447
  subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
448
  logger.info(f"Gemini returned subtopics: {subtopics}")
449
+
450
+ # Read the whole PDF
451
  with open(pdf_path, "rb") as f:
452
  pdf_bytes = f.read()
453
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
454
  total_pages = doc.page_count
455
  doc.close()
456
+
457
  final_pages = set()
458
  if not subtopics:
459
  logger.warning("No subtopics found. We'll process the entire PDF as fallback.")
 
472
  # find occurrences
473
  occs = find_all_occurrences(pdf_bytes, subname)
474
  logger.info(f"Occurrences of subtopic '{subname}': {occs}")
 
475
  doc_start_0 = start_p - 1
476
  chosen_page = None
477
  for p in occs:
 
486
  else:
487
  chosen_page = 0
488
  logger.warning(f"No occurrences for '{subname}'. Using page 0.")
489
+
490
  raw_offset = chosen_page - doc_start_0
491
  offset = max(0, raw_offset)
492
  logger.info(f"Subtopic '{subname}': doc_start={start_p}, chosen_page={chosen_page}, raw_offset={raw_offset}, offset={offset}")
 
493
  s0 = (start_p - 1) + offset
494
  e0 = (end_p - 1) + offset
495
  s0 = max(0, min(total_pages - 1, s0))
496
  e0 = max(0, min(total_pages - 1, e0))
497
  for pp in range(s0, e0 + 1):
498
  final_pages.add(pp)
499
+
 
500
  if not final_pages:
501
  logger.warning("No valid pages after offset. We'll process entire PDF.")
502
  final_pages = set(range(total_pages))
503
+
504
  logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
505
  subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
506
+
507
+ # doc_analyze => concurrency => final MD
508
  dataset = PymuDocDataset(subset_pdf_bytes)
509
  inference = doc_analyze(
510
  dataset,
 
514
  formula_enable=self.formula_enable,
515
  table_enable=self.table_enable
516
  )
517
+ logger.info("doc_analyze complete. Extracting images.")
518
+
519
  writer = LocalImageWriter(self.output_folder, self.gemini_api_key)
520
  pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
521
  md_content = pipe_result.get_markdown("local-unique-prefix/")
522
+
523
  final_markdown = writer.post_process("local-unique-prefix/", md_content)
524
+
 
525
  out_path = os.path.join(self.output_folder, "final_output.md")
526
  with open(out_path, "w", encoding="utf-8") as f:
527
  f.write(final_markdown)
 
528
  logger.info(f"Markdown saved to: {out_path}")
529
  return final_markdown
 
530
  finally:
531
  self.cleanup_gpu()
532
 
 
534
  input_pdf = "/home/user/app/input_output/ocr-specification-economics.pdf"
535
  output_dir = "/home/user/app/outputs"
536
  gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
 
537
  try:
538
  processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
539
  md_output = processor.process(input_pdf)