SkyNait commited on
Commit
b7ce5d6
·
1 Parent(s): 2ee4ef1
input_output/outputed/final_output.md CHANGED
@@ -1,4 +1,4 @@
1
- ![Two origami pinecones, one brown and one tan, are displayed against a light purple background.](images/img_1.png)
2
  ![Row 0 Col 0](images/img_2.png_rows/row_0/col_0.png)
3
  ![Row 1 Col 0](images/img_2.png_rows/row_1/col_0.png)
4
  ![Row 2 Col 0](images/img_2.png_rows/row_2/col_0.png)
@@ -234,10 +234,20 @@
234
  ![Row 4 Col 0](images/img_33.png_rows/row_4/col_0.png)
235
  ![Row 4 Col 1](images/img_33.png_rows/row_4/col_1.png)
236
  ![Row 0 Col 0](images/img_34.png_rows/row_0/col_0.png)
 
237
  ![Row 1 Col 0](images/img_34.png_rows/row_1/col_0.png)
 
238
  ![Row 2 Col 0](images/img_34.png_rows/row_2/col_0.png)
 
239
  ![Row 3 Col 0](images/img_34.png_rows/row_3/col_0.png)
240
- ![Table showing the percentage breakdown of assessment objectives (AO1, AO2, AO3) for GCE A Level Maths papers.](images/img_35.png)
 
 
 
 
 
 
 
241
  ![Row 0 Col 0](images/img_36.png_rows/row_0/col_0.png)
242
  ![Row 1 Col 0](images/img_36.png_rows/row_1/col_0.png)
243
  ![Row 2 Col 0](images/img_36.png_rows/row_2/col_0.png)
 
1
+ ![Two origami pinecones, one brown and one tan, on a purple background.](images/img_1.png)
2
  ![Row 0 Col 0](images/img_2.png_rows/row_0/col_0.png)
3
  ![Row 1 Col 0](images/img_2.png_rows/row_1/col_0.png)
4
  ![Row 2 Col 0](images/img_2.png_rows/row_2/col_0.png)
 
234
  ![Row 4 Col 0](images/img_33.png_rows/row_4/col_0.png)
235
  ![Row 4 Col 1](images/img_33.png_rows/row_4/col_1.png)
236
  ![Row 0 Col 0](images/img_34.png_rows/row_0/col_0.png)
237
+ ![Row 0 Col 1](images/img_34.png_rows/row_0/col_1.png)
238
  ![Row 1 Col 0](images/img_34.png_rows/row_1/col_0.png)
239
+ ![Row 1 Col 1](images/img_34.png_rows/row_1/col_1.png)
240
  ![Row 2 Col 0](images/img_34.png_rows/row_2/col_0.png)
241
+ ![Row 2 Col 1](images/img_34.png_rows/row_2/col_1.png)
242
  ![Row 3 Col 0](images/img_34.png_rows/row_3/col_0.png)
243
+ ![Row 0 Col 0](images/img_35.png_rows/row_0/col_0.png)
244
+ ![Row 0 Col 1](images/img_35.png_rows/row_0/col_1.png)
245
+ ![Row 1 Col 0](images/img_35.png_rows/row_1/col_0.png)
246
+ ![Row 1 Col 1](images/img_35.png_rows/row_1/col_1.png)
247
+ ![Row 2 Col 0](images/img_35.png_rows/row_2/col_0.png)
248
+ ![Row 3 Col 0](images/img_35.png_rows/row_3/col_0.png)
249
+ ![Row 4 Col 0](images/img_35.png_rows/row_4/col_0.png)
250
+ ![Row 4 Col 1](images/img_35.png_rows/row_4/col_1.png)
251
  ![Row 0 Col 0](images/img_36.png_rows/row_0/col_0.png)
252
  ![Row 1 Col 0](images/img_36.png_rows/row_1/col_0.png)
253
  ![Row 2 Col 0](images/img_36.png_rows/row_2/col_0.png)
input_output/outputed/images/img_34.png_rows/row_0/col_0.png CHANGED
input_output/outputed/images/img_34.png_rows/row_0/col_1.png ADDED
input_output/outputed/images/img_34.png_rows/row_1/col_0.png CHANGED
input_output/outputed/images/img_34.png_rows/row_1/col_1.png ADDED
input_output/outputed/images/img_34.png_rows/row_2/col_0.png CHANGED
input_output/outputed/images/img_34.png_rows/row_2/col_1.png ADDED
input_output/outputed/images/img_35.png_rows/row_0/col_0.png ADDED
input_output/outputed/images/img_35.png_rows/row_0/col_1.png ADDED
input_output/outputed/images/img_35.png_rows/row_1/col_0.png ADDED
input_output/outputed/images/img_35.png_rows/row_1/col_1.png ADDED
input_output/outputed/images/img_35.png_rows/row_2/col_0.png ADDED
input_output/outputed/images/img_35.png_rows/row_3/col_0.png ADDED
input_output/outputed/images/img_35.png_rows/row_4/col_0.png ADDED
input_output/outputed/images/img_35.png_rows/row_4/col_1.png ADDED
topic_extraction.py CHANGED
@@ -5,215 +5,221 @@ import gc
5
  import json
6
  import logging
7
  import fitz # PyMuPDF (pip install pymupdf)
8
- import requests
9
  import base64
10
  import concurrent.futures
 
11
  from typing import List, Dict, Any
12
 
 
 
13
  import torch
14
  import cv2
15
 
16
- # magic-pdf
17
  from magic_pdf.data.dataset import PymuDocDataset
18
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
19
 
20
- # TableExtractor from your "topic_extraction_upgrade.py"
21
  from table_row_extraction import TableExtractor
22
 
23
  logging.basicConfig(level=logging.INFO)
24
  logger = logging.getLogger(__name__)
25
  logger.setLevel(logging.INFO)
26
 
 
 
 
 
 
 
 
27
 
28
- # -------------------------------------------------------------------
29
- # 1) "ContentsExtractor" approach (similar to contents_extractor_v2)
30
- # -------------------------------------------------------------------
31
- try:
32
- from google import genai
33
- from google.genai import types
34
- except ImportError:
35
- genai = None
36
- types = None
37
-
38
- GEMINI_API_KEY = "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU"
39
-
40
- class ContentsExtractor:
41
- def __init__(self, api_key: str = GEMINI_API_KEY):
42
- if genai is None or types is None:
43
- raise ImportError("google.genai is not installed or environment not set up.")
44
- self.client = genai.Client(api_key=api_key)
45
- self.model = "gemini-2.0-flash"
46
-
47
- @staticmethod
48
- def extract_first_pages(pdf_path: str, num_pages: int = 10) -> str:
49
- """
50
- Reads up to `num_pages` from pdf_path, returns combined text.
51
- """
52
- try:
53
- doc = fitz.open(pdf_path)
54
- total_pages = doc.page_count
55
- pages_to_read = min(total_pages, num_pages)
56
- text_list = []
57
- for i in range(pages_to_read):
58
- page_text = doc[i].get_text()
59
- text_list.append(page_text)
60
- doc.close()
61
- return "\n".join(text_list)
62
- except Exception as e:
63
- logger.error(f"[ContentsExtractor] Could not open or read PDF: {e}")
64
- return ""
65
 
66
- def extract_contents(self, text: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  """
68
- Send the text to Gemini. Return raw LLM output, presumably JSON with subtopic pages.
 
 
 
 
 
 
 
 
69
  """
70
- if not text.strip():
71
- return "{}"
 
 
72
 
73
  prompt = f"""
74
- You have the first pages of an A-Level Mathematics specification.
75
- You will be provided with the first pages of an exam board document. Your goal is to extract
76
- the main subject-related topics from the "Contents" section and structure them in a valid JSON format.
77
-
78
- Instructions:
79
- 1. Identify the "Contents" section, which lists all topics, subtopics, and their corresponding pages.
80
- 2. Extract only the **highest-level, subject-related subtopics** (ignore organizational or administrative sections).
81
- 3. For subtopics, include the full range of pages from the first to the last subtopic.
82
- 4. Return the output in the following JSON format:
83
-
84
- {{
85
- "topic_name": [start_page, end_page]
86
- }}
87
-
88
- Important Notes:
89
- - Ignore non-subject-related sections (e.g., "Introduction", "Exam Guidelines", "Appendices", "Assessment, Qualification at a glance").
90
- - The extracted subtopics should represent major academic areas, not organizational or structural elements.
91
- - Make sure that all of the pages for a subtopic are included, end page should be the start page of the topic
92
- that comes next after the extracted one in contents section.
93
-
94
- Examples:
95
- 1. Given this table of contents:
96
-
97
- 1 Introduction 2
98
- Why choose Edexcel A Level Mathematics? - 2
99
- Supporting you in planning and implementing this qualification - 3
100
- Qualification at a glance - 5
101
- 2 Subject content and assessment information 7
102
- Paper 1 and Paper 2: Pure Mathematics - 11
103
- Paper 3: Statistics and Mechanics - 30
104
- Assessment Objectives - 40
105
- 3 Administration and general information – 42
106
- Entries - 42
107
- Access arrangements, reasonable adjustments, special consideration and malpractice - 42
108
- Student recruitment and progression - 45
109
- Appendix 1: Formulae49
110
- Appendix 2: Notation53
111
- Appendix 3: Use of calculators 59
112
- Appendix 4: Assessment Objectives60
113
- Appendix 5: The context for the development of this qualification 62
114
- Appendix 6: Transferable skills64
115
- Appendix 7: Level 3 Extended Project qualification 65
116
- Appendix 8: Codes – 67
117
-
118
- The correct output should be:
119
-
120
- {{
121
- "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
122
- "Paper 3: Statistics and Mechanics": [30, 42]
123
- }}
124
-
125
- 2. Given this table of contents:
126
-
127
- Qualification at a glance 1
128
- Assessment Objectives and weightings - 4
129
- Knowledge, skills and understanding 5
130
- Theme 1: Introduction to markets and market failure - 5
131
- Theme 2: The UK economy performance and policies - 11
132
- Theme 3: Business behaviour and the labour market - 21
133
- Theme 4: A global perspective - 29
134
- Assessment 39
135
- Assessment summary - 39
136
- Assessment objectives - 41
137
- Assessment overview - 42
138
- Breakdown of assessment objectives - 42
139
- Synoptic assessment - 43
140
- Discount code and performance tables - 43
141
- Access arrangements, reasonable adjustments and special consideration - 44
142
- Malpractice - 45
143
- Equality Act 2010 and Pearson equality policy - 45
144
- Synoptic assessment - 46
145
- Awarding and reporting - 47
146
- Other information 49
147
- Student recruitment -49
148
- Prior learning and other requirements -49
149
- Progression - 49
150
- Appendix 1: Transferable skills53
151
- Appendix 2: Level 3 Extended Project qualification 55
152
- Appendix 3: Quantitative skills 59
153
- Appendix 4: Codes61
154
- Appendix 5: Index – 63
155
-
156
- The correct output should be:
157
-
158
- {{
159
- "Theme 1: Introduction to markets and market failure": [5, 10]
160
- "Theme 2: The UK economy performance and policies": - [11, 20]
161
- "Theme 3: Business behaviour and the labour market": [21, 28]
162
- "Theme 4: A global perspective": [29, 38]
163
- }}
164
- Where pages are 1-based.
165
- No extra text. Only JSON.
166
- TEXT:
167
- {text}
168
  """
169
 
170
  try:
171
- response = self.client.models.generate_content(
172
- model=self.model,
 
173
  contents=[prompt],
174
- config=types.GenerateContentConfig(temperature=0.)
175
  )
176
- return response.text.strip() if (response and response.text) else "{}"
 
 
 
 
177
  except Exception as e:
178
- logger.error(f"[ContentsExtractor] LLM error: {e}")
179
- return "{}"
180
-
181
-
182
- # -------------------------------------------------------------------
183
- # 2) Helper to create a PDF subset from specific pages
184
- # -------------------------------------------------------------------
185
- def create_subset_pdf(pdf_bytes: bytes, page_indices: List[int]) -> bytes:
186
- """
187
- Return a new PDF containing only the pages in `page_indices` (0-based).
188
- If empty, returns original.
189
- """
190
- if not page_indices:
191
- return pdf_bytes
192
-
193
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
194
- new_doc = fitz.open()
195
- for p in sorted(set(page_indices)):
196
- if 0 <= p < doc.page_count:
197
- new_doc.insert_pdf(doc, from_page=p, to_page=p)
198
- else:
199
- logger.warning(f"Page index {p} out of range.")
200
- out_bytes = new_doc.tobytes()
201
- new_doc.close()
202
- doc.close()
203
- return out_bytes
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
- # -------------------------------------------------------------------
207
- # 3) Gemini-based table classification and description
208
- # -------------------------------------------------------------------
209
  def call_gemini_for_table_classification(image_data: bytes) -> str:
210
- if genai is None or types is None:
211
- logger.warning("Gemini not available. Return NO_TABLE.")
212
- return "NO_TABLE"
213
-
214
- prompt = """Is this image a 2-col table, 3-col table, or not a table? Return 'TWO_COLUMN','THREE_COLUMN','NO_TABLE'."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  try:
216
- client = genai.Client(api_key=GEMINI_API_KEY)
217
  response = client.models.generate_content(
218
  model="gemini-2.0-flash",
219
  contents=[
@@ -223,7 +229,7 @@ def call_gemini_for_table_classification(image_data: bytes) -> str:
223
  {
224
  "inline_data": {
225
  "mime_type": "image/jpeg",
226
- "data": base64.b64encode(image_data).decode("utf-8")
227
  }
228
  }
229
  ]
@@ -231,25 +237,41 @@ def call_gemini_for_table_classification(image_data: bytes) -> str:
231
  ],
232
  config=types.GenerateContentConfig(temperature=0.0)
233
  )
234
- out = response.text.strip().upper() if (response and response.text) else "NO_TABLE"
235
- if "THREE" in out:
 
236
  return "THREE_COLUMN"
237
- elif "TWO" in out:
238
  return "TWO_COLUMN"
239
  else:
240
  return "NO_TABLE"
 
241
  except Exception as e:
242
- logger.error(f"[call_gemini_for_table_classification] error: {e}")
243
  return "NO_TABLE"
244
 
 
 
 
 
245
  def call_gemini_for_image_description(image_data: bytes) -> str:
246
- if genai is None or types is None:
247
- logger.warning("Gemini not available. Return fallback desc.")
248
- return "Image description unavailable"
 
 
 
 
 
 
 
 
 
 
 
249
 
250
- prompt_text = """Short 20-word max summary if not a table. If it's an MCQ, mention 'MCQ: ...'."""
251
  try:
252
- client = genai.Client(api_key=GEMINI_API_KEY)
253
  response = client.models.generate_content(
254
  model="gemini-2.0-flash",
255
  contents=[
@@ -259,7 +281,7 @@ def call_gemini_for_image_description(image_data: bytes) -> str:
259
  {
260
  "inline_data": {
261
  "mime_type": "image/jpeg",
262
- "data": base64.b64encode(image_data).decode("utf-8")
263
  }
264
  }
265
  ]
@@ -267,21 +289,16 @@ def call_gemini_for_image_description(image_data: bytes) -> str:
267
  ],
268
  config=types.GenerateContentConfig(temperature=0.0)
269
  )
270
- return response.text.strip() if (response and response.text) else "Image description unavailable"
 
271
  except Exception as e:
272
- logger.error(f"[call_gemini_for_image_description] error: {e}")
273
  return "Image description unavailable"
274
 
275
-
276
- # -------------------------------------------------------------------
277
- # 4) LocalImageWriter that removes all text from final .md
278
- # -------------------------------------------------------------------
279
  class LocalImageWriter:
280
  """
281
- - Receives images from doc_analyze
282
- - Classifies them as table or no_table
283
- - Replaces single table lines with row/cell references
284
- - Output MD has only lines referencing images
285
  """
286
  def __init__(self, output_folder: str):
287
  self.output_folder = output_folder
@@ -297,6 +314,7 @@ class LocalImageWriter:
297
  self._img_count += 1
298
  local_filename = f"img_{self._img_count}.png"
299
  local_path = os.path.join(self.images_dir, local_filename)
 
300
  with open(local_path, "wb") as f:
301
  f.write(data)
302
 
@@ -310,9 +328,9 @@ class LocalImageWriter:
310
 
311
  def post_process(self, key: str, md_content: str) -> str:
312
  # 1) Table classification
313
- with concurrent.futures.ThreadPoolExecutor(max_workers=len(self.descriptions)) as exe:
314
  fut_map = {
315
- exe.submit(call_gemini_for_table_classification, info["data"]): p
316
  for p, info in self.descriptions.items()
317
  }
318
  for fut in concurrent.futures.as_completed(fut_map):
@@ -321,15 +339,15 @@ class LocalImageWriter:
321
  classification = fut.result()
322
  self.descriptions[path]['table_classification'] = classification
323
  except Exception as e:
324
- logger.error(f"Classification error for {path}: {e}")
325
  self.descriptions[path]['table_classification'] = "NO_TABLE"
326
 
327
- # 2) If NO_TABLE => short description
328
- with concurrent.futures.ThreadPoolExecutor(max_workers=len(self.descriptions)) as exe:
329
  fut_map2 = {}
330
  for p, info in self.descriptions.items():
331
  if info['table_classification'] == "NO_TABLE":
332
- fut = exe.submit(call_gemini_for_image_description, info["data"])
333
  fut_map2[fut] = p
334
 
335
  for fut in concurrent.futures.as_completed(fut_map2):
@@ -338,10 +356,10 @@ class LocalImageWriter:
338
  desc = fut.result()
339
  self.descriptions[path]['final_alt'] = desc
340
  except Exception as e:
341
- logger.error(f"Desc error for {path}: {e}")
342
  self.descriptions[path]['final_alt'] = "Image description unavailable"
343
 
344
- # 3) If 2-col or 3-col => "HAS TO BE PROCESSED"
345
  for p, info in self.descriptions.items():
346
  cls = info['table_classification']
347
  if cls == "TWO_COLUMN":
@@ -353,20 +371,22 @@ class LocalImageWriter:
353
 
354
  # 4) Replace placeholders
355
  for p, info in self.descriptions.items():
356
- old_tag = f"![]({key}{p})"
357
- new_tag = f"![{info['final_alt']}]({info['relative_path']})"
358
- md_content = md_content.replace(old_tag, new_tag)
359
 
360
- # 5) For "HAS TO BE PROCESSED" => run TableExtractor => row/cell references
361
  md_content = self._process_table_images_in_markdown(md_content)
362
 
363
- # 6) Keep only lines referencing images
364
  final_lines = []
365
  for line in md_content.split("\n"):
366
- line = line.strip()
367
- if re.match(r"^!\[.*\]\(.*\)$", line):
368
- final_lines.append(line)
369
- return "\n".join(final_lines)
 
 
370
 
371
  def _process_table_images_in_markdown(self, md_content: str) -> str:
372
  pattern = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
@@ -380,14 +400,17 @@ class LocalImageWriter:
380
  try:
381
  if col_type.lower() == 'two':
382
  extractor = TableExtractor(
 
383
  merge_two_col_rows=True,
384
  enable_subtopic_merge=True,
385
  subtopic_threshold=0.2
386
  )
387
  else:
388
  extractor = TableExtractor(
 
389
  merge_two_col_rows=False,
390
- enable_subtopic_merge=False
 
391
  )
392
 
393
  row_boxes = extractor.process_image(abs_image_path)
@@ -395,6 +418,7 @@ class LocalImageWriter:
395
  os.makedirs(out_folder, exist_ok=True)
396
  extractor.save_extracted_cells(abs_image_path, row_boxes, out_folder)
397
 
 
398
  snippet_lines = ["**Extracted table cells:**"]
399
  for i, row in enumerate(row_boxes):
400
  row_dir = os.path.join(out_folder, f"row_{i}")
@@ -407,22 +431,13 @@ class LocalImageWriter:
407
  new_snippet = "\n".join(snippet_lines)
408
  old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_path})"
409
  md_content = md_content.replace(old_line, new_snippet)
 
410
  except Exception as e:
411
  logger.error(f"Error processing table image {image_path}: {e}")
412
 
413
  return md_content
414
 
415
-
416
- # -------------------------------------------------------------------
417
- # 5) Final Pipeline
418
- # -------------------------------------------------------------------
419
- class MineruPipelineForSubtopics:
420
- """
421
- 1) Extract ~10 pages to parse contents with Gemini
422
- 2) Identify subtopic pages for 'Paper 1 and Paper 2: Pure Mathematics' and 'Paper 3: Statistics and Mechanics'
423
- 3) Create subset PDF with those pages
424
- 4) doc_analyze => only images => final MD with table references
425
- """
426
  def __init__(self, output_folder: str):
427
  self.output_folder = output_folder
428
  os.makedirs(self.output_folder, exist_ok=True)
@@ -432,7 +447,7 @@ class MineruPipelineForSubtopics:
432
  self.table_enable = False
433
  self.language = "en"
434
 
435
- self.contents_extractor = ContentsExtractor(api_key=GEMINI_API_KEY)
436
 
437
  def cleanup_gpu(self):
438
  try:
@@ -440,64 +455,54 @@ class MineruPipelineForSubtopics:
440
  torch.cuda.empty_cache()
441
  logger.info("GPU memory cleaned up.")
442
  except Exception as e:
443
- logger.error(f"Cleanup GPU error: {e}")
444
 
445
  def process(self, pdf_path: str) -> str:
 
 
 
 
 
 
 
446
  logger.info(f"Processing PDF: {pdf_path}")
447
  try:
448
- # Step 1) parse first pages => subtopics
449
- first_text = self.contents_extractor.extract_first_pages(pdf_path, num_pages=10)
450
- raw_json = self.contents_extractor.extract_contents(first_text)
451
- logger.info(f"[ContentsExtraction] raw LLM output: {raw_json}")
452
- try:
453
- subtopics_dict = json.loads(raw_json)
454
- except json.JSONDecodeError:
455
- logger.warning("Gemini did not return valid JSON. We'll parse entire doc.")
456
- subtopics_dict = {}
457
-
458
- # Step 2) gather pages from subtopics
459
- # We expect keys like "Paper 1 and Paper 2: Pure Mathematics", "Paper 3: Statistics and Mechanics"
460
- # If the LLM is correct, we'll get e.g. { "Paper 1 and Paper 2: Pure Mathematics": [11, 29], "Paper 3: Statistics and Mechanics": [30, 38] }
461
- pages_1_2 = []
462
- pages_3 = []
463
- if "Paper 1 and Paper 2: Pure Mathematics" in subtopics_dict:
464
- rng = subtopics_dict["Paper 1 and Paper 2: Pure Mathematics"]
465
- if len(rng) == 2:
466
- for p in range(rng[0], rng[1] + 1):
467
- pages_1_2.append(p)
468
-
469
- if "Paper 3: Statistics and Mechanics" in subtopics_dict:
470
- rng = subtopics_dict["Paper 3: Statistics and Mechanics"]
471
- if len(rng) == 2:
472
- for p in range(rng[0], rng[1] + 1):
473
- pages_3.append(p)
474
-
475
- all_subtopic_pages = pages_1_2 + pages_3
476
- if not all_subtopic_pages:
477
- logger.warning("No subtopic pages found. We'll do entire doc.")
478
- subset_pdf_bytes = open(pdf_path, "rb").read()
479
  else:
480
- # Convert to 0-based
481
- doc = fitz.open(pdf_path)
482
- max_page = doc.page_count
 
 
 
 
 
 
 
 
483
  doc.close()
484
 
485
  zero_based = []
486
- for p in all_subtopic_pages:
487
  z = p - 1
488
- if 0 <= z < max_page:
489
  zero_based.append(z)
490
  zero_based = sorted(set(zero_based))
491
- logger.info(f"Final subtopic pages (0-based): {zero_based}")
492
 
493
- # If empty => entire doc
494
- if not zero_based:
495
- subset_pdf_bytes = open(pdf_path, "rb").read()
496
  else:
497
- original_bytes = open(pdf_path, "rb").read()
498
- subset_pdf_bytes = create_subset_pdf(original_bytes, zero_based)
 
 
499
 
500
- # Step 3) doc_analyze => images => final MD
501
  dataset = PymuDocDataset(subset_pdf_bytes)
502
  inference = doc_analyze(
503
  dataset,
@@ -509,30 +514,45 @@ class MineruPipelineForSubtopics:
509
  )
510
  logger.info("doc_analyze complete. Extracting images...")
511
 
512
- writer = LocalImageWriter(self.output_folder)
513
- pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
 
514
 
515
  md_content = pipe_result.get_markdown("local-unique-prefix/")
516
- final_markdown = writer.post_process("local-unique-prefix/", md_content)
517
 
 
518
  md_path = os.path.join(self.output_folder, "final_output.md")
519
  with open(md_path, "w", encoding="utf-8") as f:
520
  f.write(final_markdown)
521
 
522
  logger.info(f"Markdown saved to: {md_path}")
523
  return final_markdown
 
524
  finally:
525
  self.cleanup_gpu()
526
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
527
 
528
- # -------------------------------------------------------------------
529
- # Example usage
530
- # -------------------------------------------------------------------
531
  if __name__ == "__main__":
532
  input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
533
  output_dir = "/home/user/app/input_output/outputed"
534
 
535
- processor = MineruPipelineForSubtopics(output_folder=output_dir)
536
- final_md = processor.process(input_pdf)
537
- print("\n===== FINAL .MD =====\n")
538
- # print(final_md)
 
5
  import json
6
  import logging
7
  import fitz # PyMuPDF (pip install pymupdf)
 
8
  import base64
9
  import concurrent.futures
10
+ from io import BytesIO
11
  from typing import List, Dict, Any
12
 
13
+ from google import genai
14
+ from google.genai import types
15
  import torch
16
  import cv2
17
 
 
18
  from magic_pdf.data.dataset import PymuDocDataset
19
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
20
 
 
21
  from table_row_extraction import TableExtractor
22
 
23
  logging.basicConfig(level=logging.INFO)
24
  logger = logging.getLogger(__name__)
25
  logger.setLevel(logging.INFO)
26
 
27
def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
    """
    Build a new PDF containing only the pages in `page_indices` (0-based).

    Args:
        original_pdf_bytes: Raw bytes of the source PDF.
        page_indices: 0-based page numbers to keep. Duplicates are collapsed,
            the pages are emitted in ascending order, and out-of-range indices
            are skipped with a warning.

    Returns:
        Bytes of the subset PDF, or `original_pdf_bytes` unchanged when
        `page_indices` is empty.
    """
    if not page_indices:
        return original_pdf_bytes  # nothing to select -> pass the original through

    doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
    new_doc = fitz.open()  # empty PDF that receives the selected pages
    try:
        for p in sorted(set(page_indices)):
            if 0 <= p < doc.page_count:
                new_doc.insert_pdf(doc, from_page=p, to_page=p)
            else:
                logger.warning(f"Page index {p} is out of range, skipping.")
        return new_doc.tobytes()
    finally:
        # BUGFIX: previously both documents leaked if insert_pdf()/tobytes()
        # raised; always release PyMuPDF resources.
        new_doc.close()
        doc.close()
49
+
50
class GeminiTopicExtractor:
    """
    Uses Gemini to parse a specification PDF's "Contents" section and return
    the subject-related subtopics together with their 1-based page ranges,
    e.g. {"Paper 1 and Paper 2: Pure Mathematics": [11, 29], ...}.
    """

    def __init__(self, api_key: str = None):
        # SECURITY: a live Gemini API key was previously hard-coded here as the
        # getenv() fallback and committed to the repository. That key must be
        # revoked; the key is now supplied explicitly or via GEMINI_API_KEY only.
        self.api_key = api_key or os.getenv("GEMINI_API_KEY")
        if not self.api_key:
            logger.warning("No Gemini API key configured (set GEMINI_API_KEY).")

    def extract_subtopics(self, pdf_path: str) -> Dict[str, Any]:
        """
        Extract subject-related subtopics and page ranges from a PDF.

        1) Reads the entire PDF text.
        2) Asks Gemini for JSON of the form {"topic_name": [start_page, end_page]}.
        3) Returns the parsed dict, or {} on any failure (empty PDF text,
           API error, or unparseable model output).
        """
        text_content = self._read_entire_pdf(pdf_path)
        if not text_content.strip():
            logger.warning("No text extracted from PDF. Returning empty JSON.")
            return {}

        # NOTE: the {{ }} braces are escaped because this is an f-string.
        # BUGFIX: example 2's expected output previously showed invalid JSON
        # (a stray "-" and missing commas), teaching the model to emit
        # unparseable output; the example is now valid JSON.
        prompt = f"""
You will be provided with the first pages of an exam board document. Your goal is to extract
the main subject-related topics from the "Contents" section and structure them in a valid JSON format.

Instructions:
1. Identify the "Contents" section, which lists all topics, subtopics, and their corresponding pages.
2. Extract only the **highest-level, subject-related subtopics** (ignore organizational or administrative sections).
3. For subtopics, include the full range of pages from the first to the last subtopic.
4. Return the output in the following JSON format:

{{
    "topic_name": [start_page, end_page]
}}

Important Notes:
- Ignore non-subject-related sections (e.g., "Introduction", "Exam Guidelines", "Appendices", "Assessment, Qualification at a glance").
- The extracted subtopics should represent major academic areas, not organizational or structural elements.
- Make sure that all of the pages for a subtopic are included, end page should be the start page of the topic
that comes next after the extracted one in contents section.

Examples:
1. Given this table of contents:

1 Introduction – 2
Why choose Edexcel A Level Mathematics? - 2
Supporting you in planning and implementing this qualification - 3
Qualification at a glance - 5
2 Subject content and assessment information – 7
Paper 1 and Paper 2: Pure Mathematics - 11
Paper 3: Statistics and Mechanics - 30
Assessment Objectives - 40
3 Administration and general information – 42
Entries - 42
Access arrangements, reasonable adjustments, special consideration and malpractice - 42
Student recruitment and progression - 45
Appendix 1: Formulae 49
Appendix 2: Notation53
Appendix 3: Use of calculators 59
Appendix 4: Assessment Objectives60
Appendix 5: The context for the development of this qualification 62
Appendix 6: Transferable skills64
Appendix 7: Level 3 Extended Project qualification 65
Appendix 8: Codes67

The correct output should be:

{{
    "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
    "Paper 3: Statistics and Mechanics": [30, 42]
}}

2. Given this table of contents:

Qualification at a glance – 1
Assessment Objectives and weightings - 4
Knowledge, skills and understanding 5
Theme 1: Introduction to markets and market failure - 5
Theme 2: The UK economy – performance and policies - 11
Theme 3: Business behaviour and the labour market - 21
Theme 4: A global perspective - 29
Assessment 39
Assessment summary - 39
Assessment objectives - 41
Assessment overview - 42
Breakdown of assessment objectives - 42
Synoptic assessment - 43
Discount code and performance tables - 43
Access arrangements, reasonable adjustments and special consideration - 44
Malpractice - 45
Equality Act 2010 and Pearson equality policy - 45
Synoptic assessment - 46
Awarding and reporting - 47
Other information 49
Student recruitment -49
Prior learning and other requirements -49
Progression - 49
Appendix 1: Transferable skills – 53
Appendix 2: Level 3 Extended Project qualification 55
Appendix 3: Quantitative skills59
Appendix 4: Codes61
Appendix 5: Index63

The correct output should be:

{{
    "Theme 1: Introduction to markets and market failure": [5, 10],
    "Theme 2: The UK economy – performance and policies": [11, 20],
    "Theme 3: Business behaviour and the labour market": [21, 28],
    "Theme 4: A global perspective": [29, 38]
}}

Now, extract topics from this text: {text_content}
"""

        try:
            client = genai.Client(api_key=self.api_key)
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[prompt],
                config=types.GenerateContentConfig(temperature=0.0),
            )
            raw_text = response.text.strip() if response and response.text else "{}"
            # Gemini frequently wraps JSON in markdown code fences; strip them
            # before parsing.
            cleaned = raw_text.replace("```json", "").replace("```", "")
            return json.loads(cleaned)
        except Exception as e:
            logger.error(f"Error from Gemini subtopic extraction: {e}")
            return {}

    def _read_entire_pdf(self, pdf_path: str) -> str:
        """Return the concatenated text of every page in the PDF ("" on error)."""
        text_parts = []
        try:
            doc = fitz.open(pdf_path)
            for p in range(doc.page_count):
                text_parts.append(doc.load_page(p).get_text())
            doc.close()
        except Exception as e:
            logger.error(f"Could not open/read PDF: {e}")
        return "\n".join(text_parts)
199
 
 
 
 
200
def call_gemini_for_table_classification(image_data: bytes) -> str:
    """
    Ask Gemini whether *image_data* (JPEG bytes) shows a 2- or 3-column table.

    :param image_data: raw JPEG bytes of the candidate table image.
    :return: exactly one of "TWO_COLUMN", "THREE_COLUMN" or "NO_TABLE".
             Any API/transport failure (including a missing GEMINI_API_KEY)
             degrades gracefully to "NO_TABLE".
    """
    prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
The three-column 'table' image include such key features:
- Three columns header columns
- Headers like 'Topics', 'Content', 'Guidelines'
- Numbered sections (e.g., 8.4, 9.1)
- Educational curriculum-style structure
The two-column 'table' image include such key features:
- Two columns header columns
- Headers like 'Subject content' and 'Additional information'
- Numbered sections (e.g., 2.1, 3.4)
- Educational curriculum-style structure
- Bullet description in 'Additional information'
If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
If the image does not show a table at all, respond with 'NO_TABLE'.
Return only one of these exact labels as your entire response:
TWO_COLUMN
THREE_COLUMN
NO_TABLE
"""
    try:
        # SECURITY FIX: the original shipped a hard-coded API key as the
        # os.getenv fallback. Secrets must come from the environment only,
        # and the key that was committed must be rotated.
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise RuntimeError("GEMINI_API_KEY environment variable is not set")
        client = genai.Client(api_key=api_key)
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[
                {
                    # NOTE(review): the part-list shape was elided in the diff
                    # this block was recovered from -- confirm against the
                    # original file / google-genai docs.
                    "parts": [
                        {"text": prompt},
                        {
                            "inline_data": {
                                "mime_type": "image/jpeg",
                                "data": base64.b64encode(image_data).decode('utf-8')
                            }
                        }
                    ]
                }
            ],
            # temperature 0 => deterministic label output
            config=types.GenerateContentConfig(temperature=0.0)
        )
        classification = response.text.strip() if (response and response.text) else "NO_TABLE"
        classification = classification.upper()
        # Substring match tolerates minor model verbosity around the label.
        if "THREE" in classification:
            return "THREE_COLUMN"
        elif "TWO" in classification:
            return "TWO_COLUMN"
        else:
            return "NO_TABLE"
    except Exception as e:
        logger.error(f"Table classification error: {e}")
        return "NO_TABLE"
252
 
253
+
254
+ # -------------------------------------------------------------------
255
+ # Gemini-based image description (Mineru style)
256
+ # -------------------------------------------------------------------
257
def call_gemini_for_image_description(image_data: bytes) -> str:
    """
    Ask Gemini for a short identifying description of a question-paper image.

    :param image_data: raw JPEG bytes of the image to describe.
    :return: the model's short description, or "Image description unavailable"
             on any failure (including a missing GEMINI_API_KEY).
    """
    prompt_text = """The provided image is a part of a question paper or markscheme.
Extract all the necessary information from the image to be able to identify the question.
To identify the question, we only need the following: question number and question part.
Don't include redundant information.
For example, if image contains text like: "Q1 Part A Answer: Life on earth was created by diety..."
you should return just "Q1 Part A Mark Scheme"
If there is no text on this image, return the description of the image. 20 words max.
If there are not enough data, consider information from the surrounding context.
Additionally, if the image contains a truncated part, you must describe it and mark as a
part of some another image that goes before or after current image.
If the image is of a multiple-choice question’s options, then modify your answer by appending
'MCQ: A [option] B [option] C [option] D [option]' (replacing [option] with the actual options).
Otherwise, follow the above instructions strictly.
"""
    try:
        # SECURITY FIX: the original shipped a hard-coded API key as the
        # os.getenv fallback. Secrets must come from the environment only,
        # and the key that was committed must be rotated.
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise RuntimeError("GEMINI_API_KEY environment variable is not set")
        client = genai.Client(api_key=api_key)
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[
                {
                    # NOTE(review): the part-list shape was elided in the diff
                    # this block was recovered from -- confirm against the
                    # original file / google-genai docs.
                    "parts": [
                        {"text": prompt_text},
                        {
                            "inline_data": {
                                "mime_type": "image/jpeg",
                                "data": base64.b64encode(image_data).decode('utf-8')
                            }
                        }
                    ]
                }
            ],
            # temperature 0 => stable, repeatable descriptions
            config=types.GenerateContentConfig(temperature=0.0)
        )
        return response.text.strip() if response and response.text else "Image description unavailable"
    except Exception as e:
        logger.error(f"Gemini image description error: {e}")
        return "Image description unavailable"
297
 
 
 
 
 
298
  class LocalImageWriter:
299
  """
300
+ Only writes images, does table classification, then modifies final MD
301
+ so that we keep only table references. We do not keep any text lines.


302
  """
303
# NOTE(review): this block is a diff rendering -- the bare numbers are hunk
# line numbers, and hunk lines 305-313 are elided from view: the rest of
# __init__ (presumably self.images_dir / self._img_count / self.descriptions
# setup -- TODO confirm) and the signature of the image-write method are
# not visible here.
  def __init__(self, output_folder: str):
304
# Root folder under which images/ and the final markdown are written.
  self.output_folder = output_folder

314
# Sequential counter gives each captured image a unique img_N.png name.
  self._img_count += 1
315
  local_filename = f"img_{self._img_count}.png"
316
  local_path = os.path.join(self.images_dir, local_filename)
317
+
318
  with open(local_path, "wb") as f:
319
  f.write(data)
320
 
328
 
329
def post_process(self, key: str, md_content: str) -> str:
    """
    Rewrite Mineru markdown so that only image references survive.

    Steps:
      1) Classify every captured image (2-col / 3-col / no table) in parallel.
      2) Gemini-describe the NO_TABLE images in parallel.
      3) Tag 2/3-column tables with a "HAS TO BE PROCESSED" alt text.
      4) Swap Mineru's placeholder links for local paths + final alt text.
      5) Expand tagged tables into per-row/cell image lines.
      6) Drop every line that is not a markdown image reference.

    :param key: prefix Mineru used when emitting image placeholders.
    :param md_content: raw markdown produced by the pipeline.
    :return: markdown containing image-reference lines only.
    """
    # BUGFIX: ThreadPoolExecutor raises ValueError for max_workers=0, so the
    # original crashed whenever no images were captured. Clamp to >= 1.
    workers = max(1, len(self.descriptions))

    # 1) Table classification (parallel, one task per captured image).
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        fut_map = {
            executor.submit(call_gemini_for_table_classification, info["data"]): p
            for p, info in self.descriptions.items()
        }
        for fut in concurrent.futures.as_completed(fut_map):
            path = fut_map[fut]
            try:
                self.descriptions[path]['table_classification'] = fut.result()
            except Exception as e:
                logger.error(f"[Gemini Table Classification Error for {path}]: {e}")
                self.descriptions[path]['table_classification'] = "NO_TABLE"

    # 2) If NO_TABLE => normal gemini-based description (parallel).
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        fut_map2 = {
            executor.submit(call_gemini_for_image_description, info['data']): p
            for p, info in self.descriptions.items()
            if info['table_classification'] == "NO_TABLE"
        }
        for fut in concurrent.futures.as_completed(fut_map2):
            path = fut_map2[fut]
            try:
                self.descriptions[path]['final_alt'] = fut.result()
            except Exception as e:
                logger.error(f"[Gemini Desc Error for {path}]: {e}")
                self.descriptions[path]['final_alt'] = "Image description unavailable"

    # 3) Tag 2/3-column tables for the extraction pass. The alt text must
    # match the regex in _process_table_images_in_markdown exactly.
    # NOTE(review): these branch bodies were elided in the diff this block
    # was recovered from -- reconstructed from the regex they must satisfy.
    for p, info in self.descriptions.items():
        cls = info['table_classification']
        if cls == "TWO_COLUMN":
            info['final_alt'] = "HAS TO BE PROCESSED - two column table"
        elif cls == "THREE_COLUMN":
            info['final_alt'] = "HAS TO BE PROCESSED - three column table"

    # 4) Replace placeholders with the local image path + final alt text.
    for p, info in self.descriptions.items():
        old_md = f"![]({key}{p})"
        new_md = f"![{info['final_alt']}]({info['relative_path']})"
        md_content = md_content.replace(old_md, new_md)

    # 5) For "HAS TO BE PROCESSED" => run TableExtractor => row/cell lines.
    md_content = self._process_table_images_in_markdown(md_content)

    # 6) Remove all text => keep only markdown image-reference lines.
    final_lines = [
        line.strip()
        for line in md_content.split("\n")
        if re.match(r"^\!\[.*\]\(.*\)", line.strip())
    ]
    return "\n".join(final_lines)
390
 
391
  def _process_table_images_in_markdown(self, md_content: str) -> str:
392
# Finds "HAS TO BE PROCESSED - two|three column table" image tags, runs
# TableExtractor on each referenced image, and replaces the tag with a
# markdown snippet pointing at the extracted per-cell images.
# NOTE(review): this block is a diff rendering -- the bare numbers are hunk
# line numbers, and hunk lines 393-399, 417 and 425-430 are elided: the
# match loop that binds col_type / image_path / abs_image_path / out_folder
# and the per-cell snippet lines are not visible here.
  pattern = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"

400
  try:
401
  if col_type.lower() == 'two':
402
# Two-column layout: merged rows + subtopic merging enabled.
  extractor = TableExtractor(
403
+ skip_header=True,
404
  merge_two_col_rows=True,
405
  enable_subtopic_merge=True,
406
  subtopic_threshold=0.2
407
  )
408
  else:
409
# Three-column layout: plain row extraction, no merging.
  extractor = TableExtractor(
410
+ skip_header=True,
411
  merge_two_col_rows=False,
412
+ enable_subtopic_merge=False,
413
+ subtopic_threshold=0.2
414
  )
415

416
  row_boxes = extractor.process_image(abs_image_path)

418
  os.makedirs(out_folder, exist_ok=True)
419
  extractor.save_extracted_cells(abs_image_path, row_boxes, out_folder)
420

421
+ # Build snippet
422
  snippet_lines = ["**Extracted table cells:**"]
423
  for i, row in enumerate(row_boxes):
424
  row_dir = os.path.join(out_folder, f"row_{i}")

431
  new_snippet = "\n".join(snippet_lines)
432
# The tag line inserted by post_process is replaced wholesale by the
# per-cell snippet.
  old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_path})"
433
  md_content = md_content.replace(old_line, new_snippet)
434
+
435
  except Exception as e:
436
  logger.error(f"Error processing table image {image_path}: {e}")
437

438
  return md_content
439
 
440
+ class MineruNoTextProcessor:
 
 
 
 
 
 
 
 
 
 
441
  def __init__(self, output_folder: str):
442
# Root folder for all pipeline outputs (markdown + extracted images).
  self.output_folder = output_folder
443
  os.makedirs(self.output_folder, exist_ok=True)

447
# NOTE(review): hunk lines 444-446 are elided in this diff view.
# Mineru's own table extraction is disabled -- tables are handled by the
# Gemini classification + TableExtractor pass in LocalImageWriter instead.
  self.table_enable = False
448
  self.language = "en"
449

450
# Gemini-backed extractor that maps subtopic names to page ranges.
+ self.subtopic_extractor = GeminiTopicExtractor()
451
 
452
def cleanup_gpu(self):
    """
    Best-effort release of cached CUDA memory; logs and never raises.

    Called from process()'s finally clause so the GPU is freed even when
    the pipeline fails.
    """
    try:
        # NOTE(review): the guard line was elided in the diff this block was
        # recovered from (hunk line 454) -- reconstructed as the standard
        # availability check; confirm against the original file.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            logger.info("GPU memory cleaned up.")
    except Exception as e:
        logger.error(f"Error during GPU cleanup: {e}")
 
460
  def process(self, pdf_path: str) -> str:
461
+ """
462
+ 1) Extract subtopics JSON from the PDF
463
+ 2) Flatten page ranges for subtopics
464
+ 3) Subset PDF
465
+ 4) doc_analyze => images => produce MD with only table lines
466
+ 5) Return final MD
467
+ """
468
# NOTE(review): this block is a diff rendering -- the bare numbers are hunk
# line numbers, and the doc_analyze keyword arguments (hunk lines 509-513)
# are elided from view; confirm them against the original file.
  logger.info(f"Processing PDF: {pdf_path}")
469
  try:
470
+ # 1) Extract subtopics
471
+ data = self.subtopic_extractor.extract_subtopics(pdf_path)
472
# Without the expected top-level section we fall back to the whole doc.
+ if not data or "2 Subject content and assessment information" not in data:
473
+ logger.warning("Gemini did not return '2 Subject content...' or data is empty.")
474
+ page_indices = None


























475
  else:
476
+ # 2) Flatten pages
477
+ page_indices = self._collect_page_indices(data["2 Subject content and assessment information"])
478
+
479
+ with open(pdf_path, "rb") as f:
480
+ original_pdf_bytes = f.read()
481
+
482
+ # If no pages found => entire doc
483
+ if page_indices:
484
+ # Convert from 1-based => 0-based
485
+ doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
486
+ max_p = doc.page_count
487
  doc.close()
488

489
  zero_based = []
490
+ for p in page_indices:
491
  z = p - 1
492
# Out-of-range pages from the Gemini answer are silently dropped.
+ if 0 <= z < max_p:
493
  zero_based.append(z)
494
  zero_based = sorted(set(zero_based))

495

496
+ if zero_based:
497
+ logger.info(f"Subtopic pages (0-based): {zero_based}")
498
+ subset_pdf_bytes = create_subset_pdf(original_pdf_bytes, zero_based)
499
  else:
500
+ logger.warning("No valid subtopic pages, using entire doc.")
501
+ subset_pdf_bytes = original_pdf_bytes
502
+ else:
503
+ subset_pdf_bytes = original_pdf_bytes
504

505
+ # 3) doc_analyze with subset
506
  dataset = PymuDocDataset(subset_pdf_bytes)
507
  inference = doc_analyze(
508
  dataset,

514
  )
515
  logger.info("doc_analyze complete. Extracting images...")
516

517
+ # 4) Only images => table classification => final MD
518
+ image_writer = LocalImageWriter(self.output_folder)
519
+ pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
520

521
  md_content = pipe_result.get_markdown("local-unique-prefix/")
522
+ final_markdown = image_writer.post_process("local-unique-prefix/", md_content)
523

524
+ # 5) Save final
525
  md_path = os.path.join(self.output_folder, "final_output.md")
526
  with open(md_path, "w", encoding="utf-8") as f:
527
  f.write(final_markdown)
528

529
  logger.info(f"Markdown saved to: {md_path}")
530
  return final_markdown
531
+
532
# GPU memory is released even if any step above raised.
  finally:
533
  self.cleanup_gpu()
534
 
535
+ def _collect_page_indices(self, subtopic_dict: Dict[str, List[int]]) -> List[int]:
536
+ """
537
+ Given something like:
538
+ {
539
+ "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
540
+ "Paper 3: Statistics and Mechanics": [30, 42]
541
+ }
542
+ Return [11..29, 30..42] => a flattened list of pages
543
+ """
544
+ pages = []
545
+ for _, rng in subtopic_dict.items():
546
+ if isinstance(rng, list) and len(rng) == 2:
547
+ start_p, end_p = rng
548
+ # add all pages from start to end (inclusive)
549
+ for p in range(start_p, end_p + 1):
550
+ pages.append(p)
551
+ return pages
552
 
 
 
 
553
if __name__ == "__main__":
    # Script entry point: run the no-text Mineru pipeline on one spec PDF.
    input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
    output_dir = "/home/user/app/input_output/outputed"

    final_md = MineruNoTextProcessor(output_folder=output_dir).process(input_pdf)