SkyNait committed on
Commit
1c9a4f5
·
1 Parent(s): 1805880
input_output/output/images/img_28.png CHANGED
topic_extraction.py CHANGED
@@ -5,27 +5,19 @@ import gc
5
  import json
6
  import logging
7
  import fitz # PyMuPDF (pip install pymupdf)
 
8
  import base64
9
  import concurrent.futures
10
- from io import BytesIO
11
  from typing import List, Dict, Any
12
 
13
- # Attempt to import google.genai
14
- try:
15
- from google import genai
16
- from google.genai import types
17
- except ImportError:
18
- genai = None
19
- types = None
20
-
21
  import torch
22
  import cv2
23
 
24
- # Magic PDF pipeline
25
  from magic_pdf.data.dataset import PymuDocDataset
26
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
27
 
28
- # Your TableExtractor from topic_extraction_upgrade (or similar)
29
  from table_row_extraction import TableExtractor
30
 
31
  logging.basicConfig(level=logging.INFO)
@@ -34,130 +26,113 @@ logger.setLevel(logging.INFO)
34
 
35
 
36
  # -------------------------------------------------------------------
37
- # Helper: create a subset PDF with only desired pages
38
  # -------------------------------------------------------------------
39
def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
    """
    Build a new PDF that contains only the requested pages of the original.

    Args:
        original_pdf_bytes: Raw bytes of the source PDF.
        page_indices: 0-based page numbers to keep. Duplicates are ignored and
            pages are emitted in ascending order. Out-of-range indices are
            skipped with a warning.

    Returns:
        The subset PDF as bytes. The original bytes are returned unchanged
        when `page_indices` is empty or when none of the indices is in range.
    """
    if not page_indices:
        return original_pdf_bytes  # nothing requested => keep the whole document

    doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
    try:
        new_doc = fitz.open()  # empty PDF to insert pages into
        try:
            for p in sorted(set(page_indices)):
                if 0 <= p < doc.page_count:
                    new_doc.insert_pdf(doc, from_page=p, to_page=p)
                else:
                    logger.warning(f"Page index {p} is out of range, skipping.")

            if new_doc.page_count == 0:
                # Serializing a zero-page document is expected to fail in
                # PyMuPDF, so fall back to the full document instead.
                logger.warning("No valid pages selected, returning original PDF.")
                return original_pdf_bytes

            return new_doc.tobytes()
        finally:
            # Always release both documents, even if insert/serialize raised.
            new_doc.close()
    finally:
        doc.close()
61
-
62
 
63
- # -------------------------------------------------------------------
64
- # Gemini-based subtopic extraction
65
- # -------------------------------------------------------------------
66
class GeminiTopicExtractor:
    """
    Uses Gemini to parse the PDF text, looking specifically for
    "2 Subject content and assessment information" and subtopics with pages.
    """

    def __init__(self, api_key: str = None):
        """
        Args:
            api_key: Gemini API key. When omitted, the GEMINI_API_KEY
                environment variable is used.

        Raises:
            ValueError: If no key is given and the environment variable is
                unset or empty.
        """
        # No hard-coded fallback: a credential must never live in source
        # control, and a literal default would make the check below dead code.
        self.api_key = api_key or os.getenv("GEMINI_API_KEY")
        if not self.api_key:
            raise ValueError("Gemini API key not found in environment or constructor.")

        if genai is None or types is None:
            logger.warning("google.genai is not installed. Subtopic extraction won't work.")

    def extract_subtopics(self, pdf_path: str) -> Dict[str, Any]:
        """
        Ask Gemini for the subtopics of section 2 and their page ranges.

        1) Read entire PDF text
        2) Ask Gemini for JSON structure like:
           {
             "2 Subject content and assessment information": {
               "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
               "Paper 3: Statistics and Mechanics": [30, 42]
             }
           }
        3) Return parsed JSON

        Returns:
            The parsed JSON object, or an empty dict when the PDF has no text,
            the SDK is unavailable, the request fails, or the reply is not a
            JSON object.
        """
        text_content = self._read_entire_pdf(pdf_path)
        if not text_content.strip():
            logger.warning("No text extracted from PDF. Returning empty JSON.")
            return {}

        if genai is None or types is None:
            # Fail explicitly instead of relying on an AttributeError below.
            logger.error("Error from Gemini subtopic extraction: google.genai is not installed")
            return {}

        prompt = (
            "\n"
            "You are given the text of a specification PDF.\n"
            "Identify the '2 Subject content and assessment information' topic.\n"
            "Under that topic, identify subtopics (like 'Paper 1 and Paper 2: Pure Mathematics', etc.)\n"
            "and their page ranges (1-based) from the text.\n"
            "Return JSON only, with structure:\n"
            "{\n"
            '"2 Subject content and assessment information": {\n'
            '"Paper 1 and Paper 2: Pure Mathematics": [start_page, end_page],\n'
            '"Paper 3: Statistics and Mechanics": [start_page, end_page]\n'
            "}\n"
            "}\n"
            "No extra explanation, just JSON.\n"
            "TEXT:\n"
            f"{text_content}\n"
        )

        try:
            client = genai.Client(api_key=self.api_key)
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[prompt],
                config=types.GenerateContentConfig(temperature=0.0)
            )
            raw_text = response.text.strip() if response and response.text else "{}"
            # Clean up any triple backticks the model wrapped around the JSON
            cleaned = raw_text.replace("```json", "").replace("```", "")
            data = json.loads(cleaned)
        except Exception as e:
            logger.error(f"Error from Gemini subtopic extraction: {e}")
            return {}

        if not isinstance(data, dict):
            # Callers index the result by section title; anything else is unusable.
            logger.error("Error from Gemini subtopic extraction: reply is not a JSON object")
            return {}
        return data

    def _read_entire_pdf(self, pdf_path: str) -> str:
        """
        Return the entire PDF text by concatenating all pages with newlines.

        Errors are logged rather than raised; whatever text was read before
        the failure (possibly none) is returned.
        """
        text_parts = []
        try:
            doc = fitz.open(pdf_path)
        except Exception as e:
            logger.error(f"Could not open/read PDF: {e}")
            return ""

        try:
            for p in range(doc.page_count):
                text_parts.append(doc.load_page(p).get_text())
        except Exception as e:
            logger.error(f"Could not open/read PDF: {e}")
        finally:
            doc.close()  # release the file handle even on a mid-read failure
        return "\n".join(text_parts)
 
 
 
 
 
 
 
 
 
143
 
144
 
145
  # -------------------------------------------------------------------
146
- # Gemini-based table classification (Mineru style)
147
  # -------------------------------------------------------------------
148
  def call_gemini_for_table_classification(image_data: bytes) -> str:
149
  if genai is None or types is None:
150
- logger.warning("Gemini not available. Returning NO_TABLE.")
151
  return "NO_TABLE"
152
 
153
- prompt = """You are given an image from an exam specification. Determine if it shows:
154
- - 'TWO_COLUMN' (2 col table),
155
- - 'THREE_COLUMN' (3 col table),
156
- - 'NO_TABLE' otherwise.
157
- Return only that label as entire response."""
158
-
159
  try:
160
- client = genai.Client(api_key=os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU"))
161
  response = client.models.generate_content(
162
  model="gemini-2.0-flash",
163
  contents=[
@@ -167,7 +142,7 @@ Return only that label as entire response."""
167
  {
168
  "inline_data": {
169
  "mime_type": "image/jpeg",
170
- "data": base64.b64encode(image_data).decode('utf-8')
171
  }
172
  }
173
  ]
@@ -175,34 +150,25 @@ Return only that label as entire response."""
175
  ],
176
  config=types.GenerateContentConfig(temperature=0.0)
177
  )
178
- classification = response.text.strip() if (response and response.text) else "NO_TABLE"
179
- classification = classification.upper()
180
- if "THREE" in classification:
181
  return "THREE_COLUMN"
182
- elif "TWO" in classification:
183
  return "TWO_COLUMN"
184
  else:
185
  return "NO_TABLE"
186
-
187
  except Exception as e:
188
- logger.error(f"Table classification error: {e}")
189
  return "NO_TABLE"
190
 
191
-
192
- # -------------------------------------------------------------------
193
- # Gemini-based image description (Mineru style)
194
- # -------------------------------------------------------------------
195
  def call_gemini_for_image_description(image_data: bytes) -> str:
196
  if genai is None or types is None:
197
- logger.warning("Gemini not available. Returning fallback desc.")
198
  return "Image description unavailable"
199
 
200
- prompt_text = """This image is from an exam specification.
201
- No text data needed, just a short 20-word max summary if no table is detected.
202
- If it’s an MCQ, mention 'MCQ: A [...], B [...], etc.'"""
203
-
204
  try:
205
- client = genai.Client(api_key=os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU"))
206
  response = client.models.generate_content(
207
  model="gemini-2.0-flash",
208
  contents=[
@@ -212,7 +178,7 @@ If it’s an MCQ, mention 'MCQ: A [...], B [...], etc.'"""
212
  {
213
  "inline_data": {
214
  "mime_type": "image/jpeg",
215
- "data": base64.b64encode(image_data).decode('utf-8')
216
  }
217
  }
218
  ]
@@ -220,20 +186,21 @@ If it’s an MCQ, mention 'MCQ: A [...], B [...], etc.'"""
220
  ],
221
  config=types.GenerateContentConfig(temperature=0.0)
222
  )
223
- return response.text.strip() if response and response.text else "Image description unavailable"
224
-
225
  except Exception as e:
226
- logger.error(f"Gemini image description error: {e}")
227
  return "Image description unavailable"
228
 
229
 
230
  # -------------------------------------------------------------------
231
- # The LocalImageWriter that does table extraction
232
  # -------------------------------------------------------------------
233
  class LocalImageWriter:
234
  """
235
- Only writes images, does table classification, then modifies final MD
236
- so that we keep only table references. We do not keep any text lines.
 
 
237
  """
238
  def __init__(self, output_folder: str):
239
  self.output_folder = output_folder
@@ -249,7 +216,6 @@ class LocalImageWriter:
249
  self._img_count += 1
250
  local_filename = f"img_{self._img_count}.png"
251
  local_path = os.path.join(self.images_dir, local_filename)
252
-
253
  with open(local_path, "wb") as f:
254
  f.write(data)
255
 
@@ -263,9 +229,9 @@ class LocalImageWriter:
263
 
264
  def post_process(self, key: str, md_content: str) -> str:
265
  # 1) Table classification
266
- with concurrent.futures.ThreadPoolExecutor(max_workers=len(self.descriptions)) as executor:
267
  fut_map = {
268
- executor.submit(call_gemini_for_table_classification, info["data"]): p
269
  for p, info in self.descriptions.items()
270
  }
271
  for fut in concurrent.futures.as_completed(fut_map):
@@ -274,15 +240,15 @@ class LocalImageWriter:
274
  classification = fut.result()
275
  self.descriptions[path]['table_classification'] = classification
276
  except Exception as e:
277
- logger.error(f"[Gemini Table Classification Error for {path}]: {e}")
278
  self.descriptions[path]['table_classification'] = "NO_TABLE"
279
 
280
- # 2) If NO_TABLE => normal gemini-based description
281
- with concurrent.futures.ThreadPoolExecutor(max_workers=len(self.descriptions)) as executor:
282
  fut_map2 = {}
283
  for p, info in self.descriptions.items():
284
  if info['table_classification'] == "NO_TABLE":
285
- fut = executor.submit(call_gemini_for_image_description, info['data'])
286
  fut_map2[fut] = p
287
 
288
  for fut in concurrent.futures.as_completed(fut_map2):
@@ -291,10 +257,10 @@ class LocalImageWriter:
291
  desc = fut.result()
292
  self.descriptions[path]['final_alt'] = desc
293
  except Exception as e:
294
- logger.error(f"[Gemini Desc Error for {path}]: {e}")
295
  self.descriptions[path]['final_alt'] = "Image description unavailable"
296
 
297
- # 3) If 2/3-col => "HAS TO BE PROCESSED"
298
  for p, info in self.descriptions.items():
299
  cls = info['table_classification']
300
  if cls == "TWO_COLUMN":
@@ -306,22 +272,20 @@ class LocalImageWriter:
306
 
307
  # 4) Replace placeholders
308
  for p, info in self.descriptions.items():
309
- old_md = f"![]({key}{p})"
310
- new_md = f"![{info['final_alt']}]({info['relative_path']})"
311
- md_content = md_content.replace(old_md, new_md)
312
 
313
- # 5) For "HAS TO BE PROCESSED" => run TableExtractor => replace single line with row/cell lines
314
  md_content = self._process_table_images_in_markdown(md_content)
315
 
316
- # 6) **Remove all text** => keep only lines that are image references
317
  final_lines = []
318
  for line in md_content.split("\n"):
319
- # We only keep lines that start with "!" or have "!["
320
- # (i.e. lines referencing images)
321
- if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
322
- final_lines.append(line.strip())
323
- new_md = "\n".join(final_lines)
324
- return new_md
325
 
326
  def _process_table_images_in_markdown(self, md_content: str) -> str:
327
  pattern = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
@@ -350,7 +314,6 @@ class LocalImageWriter:
350
  os.makedirs(out_folder, exist_ok=True)
351
  extractor.save_extracted_cells(abs_image_path, row_boxes, out_folder)
352
 
353
- # Build snippet
354
  snippet_lines = ["**Extracted table cells:**"]
355
  for i, row in enumerate(row_boxes):
356
  row_dir = os.path.join(out_folder, f"row_{i}")
@@ -363,7 +326,6 @@ class LocalImageWriter:
363
  new_snippet = "\n".join(snippet_lines)
364
  old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_path})"
365
  md_content = md_content.replace(old_line, new_snippet)
366
-
367
  except Exception as e:
368
  logger.error(f"Error processing table image {image_path}: {e}")
369
 
@@ -371,14 +333,14 @@ class LocalImageWriter:
371
 
372
 
373
  # -------------------------------------------------------------------
374
- # Final Processor
375
  # -------------------------------------------------------------------
376
- class MineruNoTextProcessor:
377
  """
378
- 1) Use Gemini to find '2 Subject content...' subtopics + pages
379
- 2) Subset PDF to those pages
380
- 3) doc_analyze => only images => classify tables => produce markdown with table rows only
381
- 4) No textual data in final markdown
382
  """
383
  def __init__(self, output_folder: str):
384
  self.output_folder = output_folder
@@ -389,7 +351,7 @@ class MineruNoTextProcessor:
389
  self.table_enable = False
390
  self.language = "en"
391
 
392
- self.subtopic_extractor = GeminiTopicExtractor()
393
 
394
  def cleanup_gpu(self):
395
  try:
@@ -397,54 +359,64 @@ class MineruNoTextProcessor:
397
  torch.cuda.empty_cache()
398
  logger.info("GPU memory cleaned up.")
399
  except Exception as e:
400
- logger.error(f"Error during GPU cleanup: {e}")
401
 
402
  def process(self, pdf_path: str) -> str:
403
- """
404
- 1) Extract subtopics JSON from the PDF
405
- 2) Flatten page ranges for subtopics
406
- 3) Subset PDF
407
- 4) doc_analyze => images => produce MD with only table lines
408
- 5) Return final MD
409
- """
410
  logger.info(f"Processing PDF: {pdf_path}")
411
  try:
412
- # 1) Extract subtopics
413
- data = self.subtopic_extractor.extract_subtopics(pdf_path)
414
- if not data or "2 Subject content and assessment information" not in data:
415
- logger.warning("Gemini did not return '2 Subject content...' or data is empty.")
416
- page_indices = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417
  else:
418
- # 2) Flatten pages
419
- page_indices = self._collect_page_indices(data["2 Subject content and assessment information"])
420
-
421
- with open(pdf_path, "rb") as f:
422
- original_pdf_bytes = f.read()
423
-
424
- # If no pages found => entire doc
425
- if page_indices:
426
- # Convert from 1-based => 0-based
427
- doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
428
- max_p = doc.page_count
429
  doc.close()
430
 
431
  zero_based = []
432
- for p in page_indices:
433
  z = p - 1
434
- if 0 <= z < max_p:
435
  zero_based.append(z)
436
  zero_based = sorted(set(zero_based))
 
437
 
438
- if zero_based:
439
- logger.info(f"Subtopic pages (0-based): {zero_based}")
440
- subset_pdf_bytes = create_subset_pdf(original_pdf_bytes, zero_based)
441
  else:
442
- logger.warning("No valid subtopic pages, using entire doc.")
443
- subset_pdf_bytes = original_pdf_bytes
444
- else:
445
- subset_pdf_bytes = original_pdf_bytes
446
 
447
- # 3) doc_analyze with subset
448
  dataset = PymuDocDataset(subset_pdf_bytes)
449
  inference = doc_analyze(
450
  dataset,
@@ -456,51 +428,30 @@ class MineruNoTextProcessor:
456
  )
457
  logger.info("doc_analyze complete. Extracting images...")
458
 
459
- # 4) Only images => table classification => final MD
460
- image_writer = LocalImageWriter(self.output_folder)
461
- pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
462
 
463
  md_content = pipe_result.get_markdown("local-unique-prefix/")
464
- final_markdown = image_writer.post_process("local-unique-prefix/", md_content)
465
 
466
- # 5) Save final
467
  md_path = os.path.join(self.output_folder, "final_output.md")
468
  with open(md_path, "w", encoding="utf-8") as f:
469
  f.write(final_markdown)
470
 
471
  logger.info(f"Markdown saved to: {md_path}")
472
  return final_markdown
473
-
474
  finally:
475
  self.cleanup_gpu()
476
 
477
- def _collect_page_indices(self, subtopic_dict: Dict[str, List[int]]) -> List[int]:
478
- """
479
- Given something like:
480
- {
481
- "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
482
- "Paper 3: Statistics and Mechanics": [30, 42]
483
- }
484
- Return [11..29, 30..42] => a flattened list of pages
485
- """
486
- pages = []
487
- for _, rng in subtopic_dict.items():
488
- if isinstance(rng, list) and len(rng) == 2:
489
- start_p, end_p = rng
490
- # add all pages from start to end (inclusive)
491
- for p in range(start_p, end_p + 1):
492
- pages.append(p)
493
- return pages
494
-
495
 
496
  # -------------------------------------------------------------------
497
  # Example usage
498
  # -------------------------------------------------------------------
499
if __name__ == "__main__":
    # Script entry point: run the pipeline on one specification PDF.
    # Both paths are fixed to the layout of the original deployment.
    pdf_source = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"

    processor = MineruNoTextProcessor(output_folder="/home/user/app/input_output/output")
    final_md = processor.process(pdf_source)
    # The resulting markdown is written to disk by process(); it is not
    # echoed to stdout here.
 
5
  import json
6
  import logging
7
  import fitz # PyMuPDF (pip install pymupdf)
8
+ import requests
9
  import base64
10
  import concurrent.futures
 
11
  from typing import List, Dict, Any
12
 
 
 
 
 
 
 
 
 
13
  import torch
14
  import cv2
15
 
16
+ # magic-pdf
17
  from magic_pdf.data.dataset import PymuDocDataset
18
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
19
 
20
+ # TableExtractor from table_row_extraction.py
21
  from table_row_extraction import TableExtractor
22
 
23
  logging.basicConfig(level=logging.INFO)
 
26
 
27
 
28
  # -------------------------------------------------------------------
29
+ # 1) "ContentsExtractor" approach (similar to contents_extractor_v2)
30
  # -------------------------------------------------------------------
31
+ try:
32
+ from google import genai
33
+ from google.genai import types
34
+ except ImportError:
35
+ genai = None
36
+ types = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
# Gemini API key, read from the environment so that no credential is committed
# to source control. Empty string when the GEMINI_API_KEY variable is unset.
# NOTE(review): the key that used to be hard-coded here should be treated as
# compromised and rotated.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
 
 
 
 
 
 
 
 
 
 
 
39
 
40
class ContentsExtractor:
    """
    Finds subtopic page ranges by sending the first pages of a PDF to Gemini.
    """

    def __init__(self, api_key: str = GEMINI_API_KEY):
        """
        Args:
            api_key: Key passed to the Gemini client.

        Raises:
            ImportError: If the google.genai SDK could not be imported.
        """
        if genai is None or types is None:
            raise ImportError("google.genai is not installed or environment not set up.")
        self.client = genai.Client(api_key=api_key)
        self.model = "gemini-2.0-flash"

    @staticmethod
    def extract_first_pages(pdf_path: str, num_pages: int = 10) -> str:
        """
        Reads up to `num_pages` from pdf_path, returns combined text.

        Page texts are joined with newlines. Any failure to open or read the
        PDF is logged and an empty string is returned.
        """
        try:
            doc = fitz.open(pdf_path)
            try:
                pages_to_read = min(doc.page_count, num_pages)
                return "\n".join(doc[i].get_text() for i in range(pages_to_read))
            finally:
                doc.close()  # release the file even if a page fails to read
        except Exception as e:
            logger.error(f"[ContentsExtractor] Could not open or read PDF: {e}")
            return ""

    def extract_contents(self, text: str) -> str:
        """
        Send the text to Gemini and return its reply as a JSON string.

        The reply is expected to map subtopic titles to [start_page, end_page]
        (1-based). Markdown code fences around the reply are removed so the
        result can be passed straight to json.loads.

        Returns:
            The cleaned reply, or "{}" when `text` is blank, the model returns
            nothing, or the request fails.
        """
        if not text.strip():
            return "{}"

        prompt = (
            "\n"
            "You have the first pages of an A-Level Mathematics specification.\n"
            "Identify the subtopics under '2 Subject content and assessment information', especially:\n"
            '- "Paper 1 and Paper 2: Pure Mathematics"\n'
            '- "Paper 3: Statistics and Mechanics"\n'
            "Return a JSON of the form:\n"
            "{\n"
            '"Paper 1 and Paper 2: Pure Mathematics": [start_page, end_page],\n'
            '"Paper 3: Statistics and Mechanics": [start_page, end_page]\n'
            "}\n"
            "Where pages are 1-based.\n"
            "No extra text. Only JSON.\n"
            "TEXT:\n"
            f"{text}\n"
        )

        try:
            response = self.client.models.generate_content(
                model=self.model,
                contents=[prompt],
                config=types.GenerateContentConfig(temperature=0.0)
            )
        except Exception as e:
            logger.error(f"[ContentsExtractor] LLM error: {e}")
            return "{}"

        if not (response and response.text):
            return "{}"

        # Models often wrap JSON in ```json ... ``` fences, which would make
        # json.loads fail in the caller; strip them here.
        cleaned = response.text.replace("```json", "").replace("```", "").strip()
        return cleaned or "{}"
99
 
100
+
101
+ # -------------------------------------------------------------------
102
+ # 2) Helper to create a PDF subset from specific pages
103
+ # -------------------------------------------------------------------
104
def create_subset_pdf(pdf_bytes: bytes, page_indices: List[int]) -> bytes:
    """
    Return a new PDF containing only the pages in `page_indices` (0-based).

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        page_indices: Pages to keep. Duplicates are ignored, pages are emitted
            in ascending order, and out-of-range indices are skipped with a
            warning.

    Returns:
        The subset PDF as bytes. The original bytes are returned when
        `page_indices` is empty or when no index is in range.
    """
    if not page_indices:
        return pdf_bytes

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        new_doc = fitz.open()
        try:
            for p in sorted(set(page_indices)):
                if 0 <= p < doc.page_count:
                    new_doc.insert_pdf(doc, from_page=p, to_page=p)
                else:
                    logger.warning(f"Page index {p} out of range.")

            if new_doc.page_count == 0:
                # Serializing a zero-page document is expected to fail in
                # PyMuPDF, so fall back to the full document instead.
                logger.warning("No valid pages selected, returning original PDF.")
                return pdf_bytes

            return new_doc.tobytes()
        finally:
            # Always release both documents, even if insert/serialize raised.
            new_doc.close()
    finally:
        doc.close()
123
 
124
 
125
  # -------------------------------------------------------------------
126
+ # 3) Gemini-based table classification and description
127
  # -------------------------------------------------------------------
128
  def call_gemini_for_table_classification(image_data: bytes) -> str:
129
  if genai is None or types is None:
130
+ logger.warning("Gemini not available. Return NO_TABLE.")
131
  return "NO_TABLE"
132
 
133
+ prompt = """Is this image a 2-col table, 3-col table, or not a table? Return 'TWO_COLUMN','THREE_COLUMN','NO_TABLE'."""
 
 
 
 
 
134
  try:
135
+ client = genai.Client(api_key=GEMINI_API_KEY)
136
  response = client.models.generate_content(
137
  model="gemini-2.0-flash",
138
  contents=[
 
142
  {
143
  "inline_data": {
144
  "mime_type": "image/jpeg",
145
+ "data": base64.b64encode(image_data).decode("utf-8")
146
  }
147
  }
148
  ]
 
150
  ],
151
  config=types.GenerateContentConfig(temperature=0.0)
152
  )
153
+ out = response.text.strip().upper() if (response and response.text) else "NO_TABLE"
154
+ if "THREE" in out:
 
155
  return "THREE_COLUMN"
156
+ elif "TWO" in out:
157
  return "TWO_COLUMN"
158
  else:
159
  return "NO_TABLE"
 
160
  except Exception as e:
161
+ logger.error(f"[call_gemini_for_table_classification] error: {e}")
162
  return "NO_TABLE"
163
 
 
 
 
 
164
  def call_gemini_for_image_description(image_data: bytes) -> str:
165
  if genai is None or types is None:
166
+ logger.warning("Gemini not available. Return fallback desc.")
167
  return "Image description unavailable"
168
 
169
+ prompt_text = """Short 20-word max summary if not a table. If it's an MCQ, mention 'MCQ: ...'."""
 
 
 
170
  try:
171
+ client = genai.Client(api_key=GEMINI_API_KEY)
172
  response = client.models.generate_content(
173
  model="gemini-2.0-flash",
174
  contents=[
 
178
  {
179
  "inline_data": {
180
  "mime_type": "image/jpeg",
181
+ "data": base64.b64encode(image_data).decode("utf-8")
182
  }
183
  }
184
  ]
 
186
  ],
187
  config=types.GenerateContentConfig(temperature=0.0)
188
  )
189
+ return response.text.strip() if (response and response.text) else "Image description unavailable"
 
190
  except Exception as e:
191
+ logger.error(f"[call_gemini_for_image_description] error: {e}")
192
  return "Image description unavailable"
193
 
194
 
195
  # -------------------------------------------------------------------
196
+ # 4) LocalImageWriter that removes all text from final .md
197
  # -------------------------------------------------------------------
198
  class LocalImageWriter:
199
  """
200
+ - Receives images from doc_analyze
201
+ - Classifies them as table or no_table
202
+ - Replaces single table lines with row/cell references
203
+ - Output MD has only lines referencing images
204
  """
205
  def __init__(self, output_folder: str):
206
  self.output_folder = output_folder
 
216
  self._img_count += 1
217
  local_filename = f"img_{self._img_count}.png"
218
  local_path = os.path.join(self.images_dir, local_filename)
 
219
  with open(local_path, "wb") as f:
220
  f.write(data)
221
 
 
229
 
230
  def post_process(self, key: str, md_content: str) -> str:
231
  # 1) Table classification
232
+ with concurrent.futures.ThreadPoolExecutor(max_workers=len(self.descriptions)) as exe:
233
  fut_map = {
234
+ exe.submit(call_gemini_for_table_classification, info["data"]): p
235
  for p, info in self.descriptions.items()
236
  }
237
  for fut in concurrent.futures.as_completed(fut_map):
 
240
  classification = fut.result()
241
  self.descriptions[path]['table_classification'] = classification
242
  except Exception as e:
243
+ logger.error(f"Classification error for {path}: {e}")
244
  self.descriptions[path]['table_classification'] = "NO_TABLE"
245
 
246
+ # 2) If NO_TABLE => short description
247
+ with concurrent.futures.ThreadPoolExecutor(max_workers=len(self.descriptions)) as exe:
248
  fut_map2 = {}
249
  for p, info in self.descriptions.items():
250
  if info['table_classification'] == "NO_TABLE":
251
+ fut = exe.submit(call_gemini_for_image_description, info["data"])
252
  fut_map2[fut] = p
253
 
254
  for fut in concurrent.futures.as_completed(fut_map2):
 
257
  desc = fut.result()
258
  self.descriptions[path]['final_alt'] = desc
259
  except Exception as e:
260
+ logger.error(f"Desc error for {path}: {e}")
261
  self.descriptions[path]['final_alt'] = "Image description unavailable"
262
 
263
+ # 3) If 2-col or 3-col => "HAS TO BE PROCESSED"
264
  for p, info in self.descriptions.items():
265
  cls = info['table_classification']
266
  if cls == "TWO_COLUMN":
 
272
 
273
  # 4) Replace placeholders
274
  for p, info in self.descriptions.items():
275
+ old_tag = f"![]({key}{p})"
276
+ new_tag = f"![{info['final_alt']}]({info['relative_path']})"
277
+ md_content = md_content.replace(old_tag, new_tag)
278
 
279
+ # 5) For "HAS TO BE PROCESSED" => run TableExtractor => row/cell references
280
  md_content = self._process_table_images_in_markdown(md_content)
281
 
282
+ # 6) Keep only lines referencing images
283
  final_lines = []
284
  for line in md_content.split("\n"):
285
+ line = line.strip()
286
+ if re.match(r"^!\[.*\]\(.*\)$", line):
287
+ final_lines.append(line)
288
+ return "\n".join(final_lines)
 
 
289
 
290
  def _process_table_images_in_markdown(self, md_content: str) -> str:
291
  pattern = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
 
314
  os.makedirs(out_folder, exist_ok=True)
315
  extractor.save_extracted_cells(abs_image_path, row_boxes, out_folder)
316
 
 
317
  snippet_lines = ["**Extracted table cells:**"]
318
  for i, row in enumerate(row_boxes):
319
  row_dir = os.path.join(out_folder, f"row_{i}")
 
326
  new_snippet = "\n".join(snippet_lines)
327
  old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_path})"
328
  md_content = md_content.replace(old_line, new_snippet)
 
329
  except Exception as e:
330
  logger.error(f"Error processing table image {image_path}: {e}")
331
 
 
333
 
334
 
335
  # -------------------------------------------------------------------
336
+ # 5) Final Pipeline
337
  # -------------------------------------------------------------------
338
+ class MineruPipelineForSubtopics:
339
  """
340
+ 1) Extract ~10 pages to parse contents with Gemini
341
+ 2) Identify subtopic pages for 'Paper 1 and Paper 2: Pure Mathematics' and 'Paper 3: Statistics and Mechanics'
342
+ 3) Create subset PDF with those pages
343
+ 4) doc_analyze => only images => final MD with table references
344
  """
345
  def __init__(self, output_folder: str):
346
  self.output_folder = output_folder
 
351
  self.table_enable = False
352
  self.language = "en"
353
 
354
+ self.contents_extractor = ContentsExtractor(api_key=GEMINI_API_KEY)
355
 
356
  def cleanup_gpu(self):
357
  try:
 
359
  torch.cuda.empty_cache()
360
  logger.info("GPU memory cleaned up.")
361
  except Exception as e:
362
+ logger.error(f"Cleanup GPU error: {e}")
363
 
364
  def process(self, pdf_path: str) -> str:
 
 
 
 
 
 
 
365
  logger.info(f"Processing PDF: {pdf_path}")
366
  try:
367
+ # Step 1) parse first pages => subtopics
368
+ first_text = self.contents_extractor.extract_first_pages(pdf_path, num_pages=10)
369
+ raw_json = self.contents_extractor.extract_contents(first_text)
370
+ logger.info(f"[ContentsExtraction] raw LLM output: {raw_json}")
371
+ try:
372
+ subtopics_dict = json.loads(raw_json)
373
+ except json.JSONDecodeError:
374
+ logger.warning("Gemini did not return valid JSON. We'll parse entire doc.")
375
+ subtopics_dict = {}
376
+
377
+ # Step 2) gather pages from subtopics
378
+ # We expect keys like "Paper 1 and Paper 2: Pure Mathematics", "Paper 3: Statistics and Mechanics"
379
+ # If the LLM is correct, we'll get e.g. { "Paper 1 and Paper 2: Pure Mathematics": [11, 29], "Paper 3: Statistics and Mechanics": [30, 38] }
380
+ pages_1_2 = []
381
+ pages_3 = []
382
+ if "Paper 1 and Paper 2: Pure Mathematics" in subtopics_dict:
383
+ rng = subtopics_dict["Paper 1 and Paper 2: Pure Mathematics"]
384
+ if len(rng) == 2:
385
+ for p in range(rng[0], rng[1] + 1):
386
+ pages_1_2.append(p)
387
+
388
+ if "Paper 3: Statistics and Mechanics" in subtopics_dict:
389
+ rng = subtopics_dict["Paper 3: Statistics and Mechanics"]
390
+ if len(rng) == 2:
391
+ for p in range(rng[0], rng[1] + 1):
392
+ pages_3.append(p)
393
+
394
+ all_subtopic_pages = pages_1_2 + pages_3
395
+ if not all_subtopic_pages:
396
+ logger.warning("No subtopic pages found. We'll do entire doc.")
397
+ subset_pdf_bytes = open(pdf_path, "rb").read()
398
  else:
399
+ # Convert to 0-based
400
+ doc = fitz.open(pdf_path)
401
+ max_page = doc.page_count
 
 
 
 
 
 
 
 
402
  doc.close()
403
 
404
  zero_based = []
405
+ for p in all_subtopic_pages:
406
  z = p - 1
407
+ if 0 <= z < max_page:
408
  zero_based.append(z)
409
  zero_based = sorted(set(zero_based))
410
+ logger.info(f"Final subtopic pages (0-based): {zero_based}")
411
 
412
+ # If empty => entire doc
413
+ if not zero_based:
414
+ subset_pdf_bytes = open(pdf_path, "rb").read()
415
  else:
416
+ original_bytes = open(pdf_path, "rb").read()
417
+ subset_pdf_bytes = create_subset_pdf(original_bytes, zero_based)
 
 
418
 
419
+ # Step 3) doc_analyze => images => final MD
420
  dataset = PymuDocDataset(subset_pdf_bytes)
421
  inference = doc_analyze(
422
  dataset,
 
428
  )
429
  logger.info("doc_analyze complete. Extracting images...")
430
 
431
+ writer = LocalImageWriter(self.output_folder)
432
+ pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
 
433
 
434
  md_content = pipe_result.get_markdown("local-unique-prefix/")
435
+ final_markdown = writer.post_process("local-unique-prefix/", md_content)
436
 
 
437
  md_path = os.path.join(self.output_folder, "final_output.md")
438
  with open(md_path, "w", encoding="utf-8") as f:
439
  f.write(final_markdown)
440
 
441
  logger.info(f"Markdown saved to: {md_path}")
442
  return final_markdown
 
443
  finally:
444
  self.cleanup_gpu()
445
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
 
447
  # -------------------------------------------------------------------
448
  # Example usage
449
  # -------------------------------------------------------------------
450
if __name__ == "__main__":
    # Script entry point: run the subtopic pipeline on one specification PDF.
    # Both paths are fixed to the layout of the original deployment.
    pdf_source = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"

    processor = MineruPipelineForSubtopics(output_folder="/home/user/app/input_output/outputed")
    final_md = processor.process(pdf_source)

    # Only the banner is printed; the markdown itself is written to disk by
    # process() and is not echoed to stdout.
    print("\n===== FINAL .MD =====\n")