Arsenii11 committed on
Commit
85bcd32
·
1 Parent(s): 667780f

minor changes to JSON schema output

Browse files
__pycache__/inference_svm_model.cpython-310.pyc ADDED
Binary file (1.24 kB). View file
 
__pycache__/mineru_single.cpython-310.pyc ADDED
Binary file (10.6 kB). View file
 
__pycache__/table_row_extraction.cpython-310.pyc ADDED
Binary file (10.9 kB). View file
 
__pycache__/topic_extraction.cpython-310.pyc ADDED
Binary file (24.4 kB). View file
 
__pycache__/worker.cpython-310.pyc ADDED
Binary file (6.65 kB). View file
 
topic_extr.py ADDED
@@ -0,0 +1,833 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import re
4
+ import gc
5
+ import json
6
+ import logging
7
+ import fitz
8
+ import boto3
9
+ import base64
10
+ import time
11
+ import asyncio
12
+ import tempfile
13
+ import requests
14
+ from io import BytesIO
15
+ from typing import List, Dict, Any
16
+
17
+ import torch
18
+ import cv2
19
+ import numpy as np
20
+
21
+ from google import genai
22
+ from google.genai import types
23
+
24
+ from magic_pdf.data.dataset import PymuDocDataset
25
+ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
26
+ from magic_pdf.data.data_reader_writer.base import DataWriter
27
+ from table_row_extraction import TableExtractor
28
+
29
# Module-wide logging: the root logger is configured at INFO, and this module's
# logger additionally appends timestamped records to "topic_extraction.log"
# in the current working directory.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
file_handler = logging.FileHandler("topic_extraction.log")
file_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s - %(message)s"))
logger.addHandler(file_handler)

# Lazily created Gemini client shared by every Gemini call in this module.
# It is built with the API key of whichever call runs first and then reused,
# so later calls that pass a different key still use the first client.
_GEMINI_CLIENT = None
37
+
38
def unify_whitespace(text: str) -> str:
    """Collapse every run of whitespace in *text* to one space and trim both ends."""
    return re.sub(r"\s+", " ", text).strip()
40
+
41
def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
    """Return the 0-based indices of the pages whose text contains *search_text*.

    Both the search text and each page's text are whitespace-normalised with
    ``unify_whitespace`` before the substring test, so line breaks inside a
    heading do not prevent a match.

    Args:
        pdf_bytes: The raw bytes of the PDF to search.
        search_text: The text to look for.

    Returns:
        Matching page indices in ascending order; an empty list when the
        normalised search text is empty.
    """
    needle = unify_whitespace(search_text)
    if not needle:
        # An empty needle is a substring of every page, which would report a
        # match everywhere; treat it as "nothing to search for" instead.
        return []

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        # NOTE(review): "raw" is kept exactly as the original code passed it;
        # confirm it is an extraction mode the installed PyMuPDF recognises.
        return [
            index
            for index in range(doc.page_count)
            if needle in unify_whitespace(doc[index].get_text("raw"))
        ]
    finally:
        # Close the document even when text extraction raises.
        doc.close()
52
+
53
def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
    """Build a new PDF containing only the requested pages of the original.

    Args:
        original_pdf_bytes: The raw bytes of the source PDF.
        page_indices: 0-based page indices to keep. Duplicates are ignored and
            pages are emitted in ascending order.

    Returns:
        The bytes of the newly assembled PDF.

    Raises:
        ValueError: If *page_indices* is empty or contains an index outside
            the source document.
    """
    if not page_indices:
        raise ValueError("No page indices provided for subset creation.")

    doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
    try:
        new_doc = fitz.open()
        try:
            for p in sorted(set(page_indices)):
                if not 0 <= p < doc.page_count:
                    logger.error(f"Page index {p} out of range (0..{doc.page_count - 1}).")
                    raise ValueError(f"Page index {p} out of range.")
                new_doc.insert_pdf(doc, from_page=p, to_page=p)
            return new_doc.tobytes()
        finally:
            # Both documents are closed on the error path as well as on success.
            new_doc.close()
    finally:
        doc.close()
68
+
69
class s3Writer:
    """Minimal uploader that writes byte payloads to one S3 bucket."""

    def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
        """Create the boto3 S3 client.

        Args:
            ak: Access key id passed to boto3.
            sk: Secret access key passed to boto3.
            bucket: Name of the bucket every upload goes to.
            endpoint_url: Endpoint URL passed to boto3.
        """
        self.bucket = bucket
        self.client = boto3.client(
            's3',
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
        )

    def write(self, path: str, data: bytes) -> None:
        """Upload *data* to the bucket under the key *path*.

        Any exception raised by the upload is logged and re-raised.
        """
        try:
            self.client.upload_fileobj(BytesIO(data), self.bucket, path)
            logger.info("Uploaded to S3: %s", path)
        except Exception as e:
            logger.error("Failed to upload to S3: %s", e)
            raise
91
+
92
def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
    """Downscale an encoded image and re-encode it as JPEG.

    Args:
        image_data: Encoded image bytes.
        max_dim: Longest allowed side in pixels; larger images are shrunk
            proportionally.
        quality: JPEG quality used when re-encoding.

    Returns:
        The re-encoded JPEG bytes, or *image_data* unchanged when the input is
        empty, cannot be decoded, or cannot be re-encoded.
    """
    if not image_data:
        # Nothing to decode; hand the input back untouched.
        return image_data

    arr = np.frombuffer(image_data, np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        return image_data

    h, w = img.shape[:2]
    longest = max(h, w)
    if longest > max_dim:
        scale = max_dim / float(longest)
        # Clamp to 1 pixel so an extreme aspect ratio cannot round a side to 0.
        new_w = max(1, int(w * scale))
        new_h = max(1, int(h * scale))
        img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)

    encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
    success, enc = cv2.imencode(".jpg", img, encode_params)
    return enc.tobytes() if success else image_data
107
+
108
def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
    """Ask Gemini whether an image shows a two- or three-column table.

    Args:
        image_data: JPEG bytes of the image to classify.
        api_key: Gemini API key, used only if the shared client has not been
            created yet.
        max_retries: Number of additional attempts after the first failure.

    Returns:
        ``"TWO_COLUMN"``, ``"THREE_COLUMN"`` or ``"NO_TABLE"``. ``"NO_TABLE"``
        is also the answer when the response is empty, when an error mentions
        "503", or when every attempt fails.
    """
    global _GEMINI_CLIENT

    # The prompt does not depend on the attempt, so it is built once.
    prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
The three-column 'table' image includes such key features:
- Three columns header
- Headers like 'Topics', 'Content', 'Guidelines'
- Possibly sections (e.g. 8.4, 9.1)
The two-column 'table' image includes such key features:
- Two columns
- Headers like 'Subject content' and 'Additional information'
- Possibly sections (e.g. 2.1, 3.4)
If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
If the image does not show a table at all, respond with 'NO_TABLE'.
Return only one of these exact labels.
"""

    for attempt in range(max_retries + 1):
        try:
            if _GEMINI_CLIENT is None:
                _GEMINI_CLIENT = genai.Client(api_key=api_key)
            client = _GEMINI_CLIENT

            resp = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[
                    {
                        "parts": [
                            {"text": prompt},
                            {
                                "inline_data": {
                                    "mime_type": "image/jpeg",
                                    "data": base64.b64encode(image_data).decode('utf-8'),
                                }
                            },
                        ]
                    }
                ],
                config=types.GenerateContentConfig(temperature=0.0),
            )
            if resp and resp.text:
                classification = resp.text.strip().upper()
                # "THREE" is tested first, matching the original precedence.
                if "THREE" in classification:
                    return "THREE_COLUMN"
                if "TWO" in classification:
                    return "TWO_COLUMN"
            return "NO_TABLE"
        except Exception as e:
            logger.error(f"Gemini table classification error: {e}")
            if "503" in str(e):
                # A 503 is not retried; the image is treated as not a table.
                return "NO_TABLE"
            if attempt < max_retries:
                time.sleep(0.5)
            else:
                return "NO_TABLE"

    # Reached only when the loop never runs (negative max_retries).
    return "NO_TABLE"
165
+
166
async def classify_image_async(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
    """Classify an image without blocking the event loop on the Gemini call.

    The image is shrunk and re-encoded with ``preprocess_image`` on the calling
    thread, then ``call_gemini_for_table_classification`` runs in the loop's
    default executor.

    Returns:
        The label produced by ``call_gemini_for_table_classification``.
    """
    # Inside a coroutine the running loop is the correct one to use;
    # get_event_loop() is discouraged here.
    loop = asyncio.get_running_loop()
    preprocessed = preprocess_image(image_data)
    return await loop.run_in_executor(
        None, call_gemini_for_table_classification, preprocessed, api_key, max_retries
    )
170
+
171
+
172
def call_gemini_for_subtopic_identification(text: str, api_key: str, max_retries: int = 1) -> dict:
    """Ask Gemini for the main topic and subtopic numbers found in table text.

    Args:
        text: Text recognised from a specification table.
        api_key: Gemini API key, used only if the shared client has not been
            created yet.
        max_retries: Number of additional attempts after a failed request.

    Returns:
        A dict ``{"title": str, "subtopics": list}``, for example
        ``{"title": "2 Algebra and functions", "subtopics": ["2.5", "2.6"]}``.
        The empty form ``{"title": "", "subtopics": []}`` is returned when
        Gemini gives no text, the reply is not a JSON object, or every attempt
        fails.
    """
    global _GEMINI_CLIENT

    # The literal JSON braces are doubled so the f-string emits them verbatim.
    # With single braces the example object is parsed as a replacement field,
    # the prompt cannot be built, and every call falls through to the empty
    # result.
    prompt = f"""
You are given text extracted from a table that represents topics and subtopics from an educational curriculum.
The text may include a main topic heading in the format: "<number> <Topic Name>", for example, "2 Algebra and functions".
It may also include subtopics in the format of "<number>.<number>", such as "2.5", "3.4", etc.
Extract and output a valid JSON object with exactly two keys:
- "title": the main topic heading (if found). If not found, use an empty string.
- "subtopics": an array of strings representing each subtopic number extracted from the text.
Output exactly in this JSON format with no additional text. For example:
{{
"title": "2 Algebra and functions",
"subtopics": ["2.5", "2.6"]
}}
Text:
{text}
"""

    for attempt in range(max_retries + 1):
        try:
            if _GEMINI_CLIENT is None:
                _GEMINI_CLIENT = genai.Client(api_key=api_key)
            client = _GEMINI_CLIENT

            resp = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[prompt],
                config=types.GenerateContentConfig(temperature=0.0),
            )

            if not resp or not resp.text:
                # No reply from Gemini: fall back to the empty result.
                return {"title": "", "subtopics": []}

            # Strip Markdown code fences before parsing, the same clean-up
            # GeminiTopicExtractor.extract_subtopics applies to its replies.
            raw = resp.text.strip().replace("```json", "").replace("```", "").strip()

            try:
                data = json.loads(raw)
            except ValueError as json_err:
                logger.error(f"Subtopic JSON parsing error: {json_err}")
                return {"title": "", "subtopics": []}

            if not isinstance(data, dict):
                return {"title": "", "subtopics": []}

            # Guarantee the documented structure whatever the model sent.
            title = data.get("title", "")
            if not isinstance(title, str):
                title = ""
            subs = data.get("subtopics", [])
            if not isinstance(subs, list):
                subs = []
            return {"title": title, "subtopics": subs}

        except Exception as e:
            logger.error(f"Gemini subtopic identification error: {e}")
            if attempt < max_retries:
                time.sleep(0.5)
            else:
                return {"title": "", "subtopics": []}

    # Reached only when the loop never runs (negative max_retries).
    return {"title": "", "subtopics": []}
239
+
240
+
241
+
242
class S3ImageWriter(DataWriter):
    """DataWriter that uploads extracted images to S3 and post-processes tables.

    Every image handed to ``write`` is uploaded and remembered. ``post_process``
    then classifies each image with Gemini, rewrites the markdown image
    references, and builds one subtopic JSON entry per detected table in
    ``extracted_subtopics``.
    """

    def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
        self.s3_writer = s3_writer
        # Keys are built by plain concatenation, so the prefix must end in "/".
        self.base_path = base_path if base_path.endswith("/") else base_path + "/"
        self.gemini_api_key = gemini_api_key
        # path given to write() -> {"data", "s3_path", "table_classification", "final_alt"}
        self.descriptions = {}
        self._img_count = 0
        self.extracted_tables = {}
        # S3 key of a table image -> subtopic JSON built for that table.
        self.extracted_subtopics = {}

    def write(self, path: str, data: bytes) -> None:
        """Upload *data* as ``img_<n>.jpg`` under the base path and record it under *path*."""
        self._img_count += 1
        unique_id = f"img_{self._img_count}.jpg"
        s3_key = f"{self.base_path}{unique_id}"
        self.s3_writer.write(s3_key, data)
        self.descriptions[path] = {
            "data": data,
            "s3_path": s3_key,
            "table_classification": "NO_TABLE",
            "final_alt": "",
        }

    async def post_process_async(self, key: str, md_content: str) -> str:
        """Classify all recorded images and return the image-only markdown.

        Args:
            key: Prefix that precedes each image path in the markdown references.
            md_content: Markdown containing ``![](<key><path>)`` references.

        Returns:
            The markdown lines that are image references, joined by newlines.
        """
        logger.info("Classifying images to detect tables.")
        tasks = {
            p: asyncio.create_task(classify_image_async(info["data"], self.gemini_api_key))
            for p, info in self.descriptions.items()
        }
        results = await asyncio.gather(*tasks.values(), return_exceptions=True)
        for p, result in zip(tasks.keys(), results):
            if isinstance(result, Exception):
                logger.error(f"Table classification error for {p}: {result}")
                self.descriptions[p]['table_classification'] = "NO_TABLE"
            else:
                self.descriptions[p]['table_classification'] = result

        # Replace the original markdown references with alt text and S3 keys.
        alt_by_class = {
            "TWO_COLUMN": "HAS TO BE PROCESSED - two column table",
            "THREE_COLUMN": "HAS TO BE PROCESSED - three column table",
        }
        for p, info in self.descriptions.items():
            info['final_alt'] = alt_by_class.get(info['table_classification'], "NO_TABLE image")
            md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['s3_path']})")

        md_content = await self._process_table_images_in_markdown(key, md_content)

        # Keep only the lines that are image references.
        final_lines = [
            line.strip() for line in md_content.split("\n")
            if re.match(r"^\!\[.*\]\(.*\)", line.strip())
        ]
        return "\n".join(final_lines)

    async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
        """Expand each table image reference into per-cell references.

        For every reference tagged "HAS TO BE PROCESSED", the table image is
        run through ``TableExtractor``, one image is uploaded per detected
        cell, a subtopic JSON entry is stored in ``extracted_subtopics``, and
        the reference is replaced by the list of cell references. A failure on
        one table is logged and the remaining tables are still processed.
        """
        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
        if not matches:
            return md_content

        # Build the S3-key lookup once instead of scanning all descriptions
        # for every matched table.
        data_by_s3_key = {desc.get("s3_path"): desc.get("data") for desc in self.descriptions.values()}

        for (col_type, s3_key) in matches:
            logger.info(f"Processing table image: {s3_key}, columns={col_type}")
            img_data = data_by_s3_key.get(s3_key)
            if img_data is None:
                logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
                continue

            # TableExtractor is given a file path, so the image is spilled to
            # a temporary file that is removed again in the finally clause.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
                temp_file.write(img_data)
                temp_path = temp_file.name

            try:
                # Row merging and subtopic merging are enabled only for
                # two-column tables.
                is_two_col = col_type.lower() == 'two'
                extractor = TableExtractor(
                    skip_header=True,
                    merge_two_col_rows=is_two_col,
                    enable_subtopic_merge=is_two_col,
                    subtopic_threshold=0.2,
                )
                row_boxes = extractor.process_image(temp_path)

                snippet = ["**Extracted table cells:**"]
                cell_texts = []
                for i, row in enumerate(row_boxes):
                    for j, _box in enumerate(row):
                        cell_key = f"{self.base_path}cells/table_s3_{os.path.basename(s3_key)}_r{i}_c{j}.jpg"
                        # NOTE(review): placeholder behaviour kept from the
                        # original. The whole table image is uploaded for
                        # every cell instead of a crop.
                        self.s3_writer.write(cell_key, img_data)

                        # NOTE(review): placeholder. No cell text is
                        # recognised yet, so Gemini only ever sees "...".
                        text = "..."
                        cell_texts.append(text)

                        snippet.append(f"![Row {i} Col {j}]({cell_key})")

                combined_text = "\n".join(cell_texts)
                subtopic_info = call_gemini_for_subtopic_identification(combined_text, self.gemini_api_key)

                # subtopic_info looks like {"title": "2 Algebra and functions", "subtopics": ["2.5"]}
                final_json = {
                    "title": subtopic_info.get("title", ""),
                    "contents": [
                        {"type": "image", "key": s3_key}
                    ],
                    "children": [
                        {
                            "title": st,
                            "contents": [
                                {"type": "image", "key": f"subtopic_{st}_example.jpg"}
                            ],
                        }
                        for st in subtopic_info.get("subtopics", [])
                    ],
                }
                self.extracted_subtopics[s3_key] = final_json

                # Replace the table image line in the markdown with the snippet.
                new_snip = "\n".join(snippet)
                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
                md_content = md_content.replace(old_line, new_snip)

            except Exception as e:
                logger.error(f"Error processing table image {s3_key}: {e}")
            finally:
                # A failed clean-up must not abort the remaining tables.
                try:
                    os.remove(temp_path)
                except OSError as cleanup_err:
                    logger.warning(f"Could not remove temporary file {temp_path}: {cleanup_err}")

        return md_content

    def post_process(self, key: str, md_content: str) -> str:
        """Synchronous wrapper that runs ``post_process_async`` with ``asyncio.run``."""
        return asyncio.run(self.post_process_async(key, md_content))
386
+
387
+
388
class LocalImageWriter(DataWriter):
    """DataWriter that saves extracted images locally and post-processes tables.

    This is the local-disk counterpart of ``S3ImageWriter``: images are written
    into ``output_folder``, classified with Gemini, and table images are split
    into per-cell image files by ``TableExtractor``.
    """

    def __init__(self, output_folder: str, gemini_api_key: str):
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)
        # path given to write() -> {"data", "relative_path", "table_classification", "final_alt"}
        self.descriptions = {}
        self._img_count = 0
        self.gemini_api_key = gemini_api_key

        # image file name -> relative paths of the cell images extracted from it.
        self.extracted_tables = {}

    def write(self, path: str, data: bytes) -> None:
        """Record *data* under *path* and save it as ``img_<n>.jpg`` in the output folder."""
        self._img_count += 1
        unique_id = f"img_{self._img_count}.jpg"
        self.descriptions[path] = {
            "data": data,
            "relative_path": unique_id,
            "table_classification": "NO_TABLE",
            "final_alt": "",
        }
        # Also save the original image locally for testing.
        image_path = os.path.join(self.output_folder, unique_id)
        with open(image_path, "wb") as f:
            f.write(data)

    async def post_process_async(self, key: str, md_content: str) -> str:
        """Classify all recorded images and return the image-only markdown.

        Args:
            key: Prefix that precedes each image path in the markdown references.
            md_content: Markdown containing ``![](<key><path>)`` references.

        Returns:
            The markdown lines that are image references, joined by newlines.
        """
        logger.info("Classifying images to detect tables.")
        # Run the classifications concurrently, as S3ImageWriter does; awaiting
        # the coroutines one by one would serialise the Gemini calls.
        paths = list(self.descriptions)
        outcomes = await asyncio.gather(
            *(classify_image_async(self.descriptions[p]["data"], self.gemini_api_key) for p in paths),
            return_exceptions=True,
        )
        for p, outcome in zip(paths, outcomes):
            if isinstance(outcome, Exception):
                logger.error(f"Table classification error: {outcome}")
                self.descriptions[p]['table_classification'] = "NO_TABLE"
            else:
                self.descriptions[p]['table_classification'] = outcome

        alt_by_class = {
            "TWO_COLUMN": "HAS TO BE PROCESSED - two column table",
            "THREE_COLUMN": "HAS TO BE PROCESSED - three column table",
        }
        for p, info in self.descriptions.items():
            info['final_alt'] = alt_by_class.get(info['table_classification'], "NO_TABLE image")
            md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['relative_path']})")

        md_content = self._process_table_images_in_markdown(md_content)

        # Keep only the lines that are image references.
        final_lines = [
            line.strip() for line in md_content.split("\n")
            if re.match(r"^\!\[.*\]\(.*\)", line.strip())
        ]
        return "\n".join(final_lines)

    def _process_table_images_in_markdown(self, md_content: str) -> str:
        """Expand each table image reference into per-cell references.

        For every reference tagged "HAS TO BE PROCESSED", the image is run
        through ``TableExtractor``, the cells are saved under
        ``<image>_rows/``, the produced files are recorded in
        ``extracted_tables``, and the reference is replaced by the list of cell
        references. A failure on one table is logged and the remaining tables
        are still processed.
        """
        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
        if not matches:
            return md_content

        # Build the lookup once instead of scanning all descriptions per table.
        desc_by_image = {val["relative_path"]: val for val in self.descriptions.values()}

        for (col_type, image_id) in matches:
            logger.info(f"Processing table image => {image_id}, columns={col_type}")
            temp_path = os.path.join(self.output_folder, image_id)
            desc_item = desc_by_image.get(image_id)
            if not desc_item:
                logger.warning(f"No matching image data for {image_id}, skipping extraction.")
                continue
            if not os.path.exists(temp_path):
                with open(temp_path, "wb") as f:
                    f.write(desc_item["data"])
            try:
                # Row merging and subtopic merging are enabled only for
                # two-column tables.
                is_two_col = col_type.lower() == 'two'
                extractor = TableExtractor(
                    skip_header=True,
                    merge_two_col_rows=is_two_col,
                    enable_subtopic_merge=is_two_col,
                    subtopic_threshold=0.2,
                )
                row_boxes = extractor.process_image(temp_path)
                out_folder = temp_path + "_rows"
                os.makedirs(out_folder, exist_ok=True)
                extractor.save_extracted_cells(temp_path, row_boxes, out_folder)

                # List all extracted cell images relative to the output folder.
                extracted_cells = [
                    os.path.relpath(os.path.join(root, file), self.output_folder)
                    for root, _dirs, files in os.walk(out_folder)
                    for file in files
                ]
                # Save mapping for testing.
                self.extracted_tables[image_id] = extracted_cells

                # NOTE(review): the row_<i>/col_<j>.jpg layout is presumed to
                # be what save_extracted_cells writes; confirm in
                # table_row_extraction.
                snippet = ["**Extracted table cells:**"]
                for i, row in enumerate(row_boxes):
                    row_dir = os.path.join(out_folder, f"row_{i}")
                    for j, _ in enumerate(row):
                        cell_path = os.path.join(row_dir, f"col_{j}.jpg")
                        relp = os.path.relpath(cell_path, self.output_folder)
                        snippet.append(f"![Row {i} Col {j}]({relp})")

                new_snip = "\n".join(snippet)
                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_id})"
                md_content = md_content.replace(old_line, new_snip)
            except Exception as e:
                logger.error(f"Error processing table image {image_id}: {e}")
            finally:
                # NOTE(review): this deletes the copy that write() saved into
                # the output folder; confirm that is intended.
                if os.path.exists(temp_path):
                    os.remove(temp_path)
        return md_content

    def post_process(self, key: str, md_content: str) -> str:
        """Synchronous wrapper that runs ``post_process_async`` with ``asyncio.run``."""
        return asyncio.run(self.post_process_async(key, md_content))
506
+
507
class GeminiTopicExtractor:
    """Finds the major subtopics of a specification PDF from its table of contents.

    The text of the first pages is sent to Gemini together with worked
    examples; the reply is parsed into a mapping of subtopic name to a
    ``[start_page, end_page]`` range.
    """

    def __init__(self, api_key: str = None, num_pages: int = 14):
        """Store the API key and page budget.

        Args:
            api_key: Gemini API key; falls back to the GEMINI_API_KEY
                environment variable (or an empty string) when falsy.
            num_pages: How many leading pages of the PDF to read.
        """
        self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
        self.num_pages = num_pages

    def extract_subtopics(self, pdf_path: str) -> Dict[str, List[int]]:
        """Return ``{subtopic name: [start_page, end_page]}`` for the PDF.

        Args:
            pdf_path: Local path or http(s) URL of the PDF.

        Returns:
            The page ranges reported by Gemini, or an empty dict when no text
            could be read, Gemini returned nothing usable, or the call failed.
        """
        global _GEMINI_CLIENT

        first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
        if not first_pages_text.strip():
            logger.error("No text from first pages => cannot extract subtopics.")
            return {}

        # NOTE(review): in example 1 the expected end page of "Paper 3" is 42,
        # although the stated "next topic minus 1" rule would give 39. The
        # prompt text is reproduced unchanged; confirm which is intended.
        prompt = f"""
You have the first pages of a PDF specification, including a table of contents.
Instructions:
1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
4. Output only valid JSON of the form:
{{
"Subtopic A": [start_page, end_page],
"Subtopic B": [start_page, end_page]
}}
5. If you can't find any subtopics, return an empty JSON.
Important notes:
- The correct "end_page" must be the page number of the next topic or subtopic minus 1.
- The final output must be valid JSON only, with no extra text or code blocks.
Examples:
1. Given this table of contents:
1 Introduction – 2
Why choose Edexcel A Level Mathematics? - 2
Supporting you in planning and implementing this qualification - 3
Qualification at a glance - 5
2 Subject content and assessment information – 7
Paper 1 and Paper 2: Pure Mathematics - 11
Paper 3: Statistics and Mechanics - 30
Assessment Objectives - 40
3 Administration and general information – 42
Entries - 42
Access arrangements, reasonable adjustments, special consideration and malpractice - 42
Student recruitment and progression - 45
Appendix 1: Formulae – 49
Appendix 2: Notation – 53
Appendix 3: Use of calculators – 59
Appendix 4: Assessment Objectives – 60
Appendix 5: The context for the development of this qualification – 62
Appendix 6: Transferable skills – 64
Appendix 7: Level 3 Extended Project qualification – 65
Appendix 8: Codes – 67
The correct output should be:
{{
"Paper 1 and Paper 2: Pure Mathematics": [11, 29],
"Paper 3: Statistics and Mechanics": [30, 42]
}}
2. Given this table of contents:
Qualification at a glance – 1
Assessment Objectives and weightings - 4
Knowledge, skills and understanding – 5
Theme 1: Introduction to markets and market failure - 5
Theme 2: The UK economy – performance and policies - 11
Theme 3: Business behaviour and the labour market - 21
Theme 4: A global perspective - 29
Assessment – 39
Assessment summary - 39
Assessment objectives - 41
Assessment overview - 42
Breakdown of assessment objectives - 42
Synoptic assessment - 43
Discount code and performance tables - 43
Access arrangements, reasonable adjustments and special consideration - 44
Malpractice - 45
Equality Act 2010 and Pearson equality policy - 45
Synoptic assessment - 46
Awarding and reporting - 47
Other information – 49
Student recruitment -49
Prior learning and other requirements -49
Progression - 49
Appendix 1: Transferable skills – 53
Appendix 2: Level 3 Extended Project qualification – 55
Appendix 3: Quantitative skills – 59
Appendix 4: Codes – 61
Appendix 5: Index – 63
The correct output should be:
{{
"Theme 1: Introduction to markets and market failure": [5, 10],
"Theme 2: The UK economy – performance and policies": [11, 20],
"Theme 3: Business behaviour and the labour market": [21, 28],
"Theme 4: A global perspective": [29, 38]
}}
3. You might also see sections like:
2.1 AS Unit 1 11
2.2 AS Unit 2 18
2.3 A2 Unit 3 24
2.4 A2 Unit 4 31
In that scenario, your output might look like:
{{
"2.1 AS Unit 1": [11, 17],
"2.2 AS Unit 2": [18, 23],
"2.3 A2 Unit 3": [24, 30],
"2.4 A2 Unit 4": [31, 35]
}}
4. Another example might list subtopics:
3.1 Overarching themes 11
3.2 A: Proof 12
3.3 B: Algebra and functions 13
3.4 C: Coordinate geometry in the ( x , y ) plane 14
3.5 D: Sequences and series 15
3.6 E: Trigonometry 16
3.7 F: Exponentials and logarithms 17
3.8 G: Differentiation 18
3.9 H: Integration 19
3.10 I: Numerical methods 20
3.11 J: Vectors 20
3.12 K: Statistical sampling 21
3.13 L: Data presentation and interpretation 21
3.14 M: Probability 22
3.15 N: Statistical distributions 23
3.16 O: Statistical hypothesis testing 23
3.17 P: Quantities and units in mechanics 24
3.18 Q: Kinematics 24
3.19 R: Forces and Newton’s laws 24
3.20 S: Moments 25
3.21 Use of data in statistics 26
Here the correct output might look like:
{{
"A: Proof": [12, 12],
"B: Algebra and functions": [13, 13],
...
}}
Now, extract topics from this text:
{first_pages_text}
"""
        if _GEMINI_CLIENT is None:
            _GEMINI_CLIENT = genai.Client(api_key=self.api_key)
        client = _GEMINI_CLIENT
        try:
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[prompt],
                config=types.GenerateContentConfig(temperature=0.0),
            )
            if not response or not response.text:
                logger.warning("No text from LLM => returning empty subtopics.")
                return {}
            return self._parse_page_ranges(response.text)
        except Exception as e:
            logger.error(f"Gemini subtopic extraction error: {e}")
            return {}

    @staticmethod
    def _parse_page_ranges(raw_text: str) -> Dict[str, List[int]]:
        """Turn Gemini's reply into ``{name: [start_page, end_page]}``.

        Markdown code fences are stripped first. If any top-level value is
        itself an object, the first such object is used as the mapping (the
        model sometimes wraps the result in an outer key); otherwise the
        top-level object is used. Only entries whose value is a list of exactly
        two integers are kept, so callers can do page arithmetic safely.
        """
        cleaned = raw_text.strip().replace("```json", "").replace("```", "")
        try:
            data = json.loads(cleaned)
        except Exception as json_err:
            logger.error(f"JSON parsing error: {json_err}")
            return {}
        if not isinstance(data, dict):
            logger.error("JSON parsing error: top-level value is not an object.")
            return {}

        nested = next((v for v in data.values() if isinstance(v, dict)), None)
        source = nested if nested is not None else data

        def _is_page_range(rng) -> bool:
            return (
                isinstance(rng, list)
                and len(rng) == 2
                and all(isinstance(n, int) and not isinstance(n, bool) for n in rng)
            )

        return {name: rng for name, rng in source.items() if _is_page_range(rng)}

    def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
        """Return the text of the first *num_pages* pages, joined by newlines.

        *pdf_path* may be a local path or an http(s) URL. Any failure is logged
        and whatever text was collected so far (possibly none) is returned.
        """
        text_parts = []
        try:
            if pdf_path.startswith(("http://", "https://")):
                # A timeout keeps an unresponsive server from hanging the run.
                response = requests.get(pdf_path, timeout=60)
                if response.status_code != 200:
                    logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
                    return ""
                pdf_bytes = response.content
            else:
                with open(pdf_path, "rb") as f:
                    pdf_bytes = f.read()

            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            try:
                pages_to_read = min(num_pages, doc.page_count)
                for i in range(pages_to_read):
                    # NOTE(review): "raw" is kept exactly as the original code
                    # passed it; confirm it is a mode PyMuPDF recognises.
                    text_parts.append(doc[i].get_text("raw"))
            finally:
                # Close the document even when text extraction raises.
                doc.close()
        except Exception as e:
            logger.error(f"Could not open PDF: {e}")
        return "\n".join(text_parts)
698
+
699
+
700
class MineruNoTextProcessor:
    """End-to-end pipeline: pick the subtopic pages of a PDF and extract their tables.

    The pipeline asks Gemini for subtopic page ranges, maps those printed page
    numbers to PDF page indices, runs layout analysis on just those pages, and
    post-processes the extracted images through ``S3ImageWriter``.
    """

    def __init__(self, output_folder: str, gemini_api_key: str):
        """Create the output folder, the subtopic extractor and the S3 writer.

        S3 credentials and endpoint are read from the S3_ACCESS_KEY,
        S3_SECRET_KEY and S3_ENDPOINT environment variables.
        """
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)
        self.layout_model = "doclayout_yolo"
        self.formula_enable = True
        self.table_enable = False
        self.language = "en"

        self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=20)
        self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")

        self.use_s3 = True
        self.s3_writer = s3Writer(
            ak=os.getenv("S3_ACCESS_KEY"),
            sk=os.getenv("S3_SECRET_KEY"),
            bucket="quextro-resources",
            endpoint_url=os.getenv("S3_ENDPOINT"),
        )

    def cleanup_gpu(self):
        """Run garbage collection and empty the CUDA cache; failures are only logged."""
        try:
            gc.collect()
            torch.cuda.empty_cache()
            logger.info("GPU memory cleaned up.")
        except Exception as e:
            logger.error(f"Error during GPU cleanup: {e}")

    @staticmethod
    def _load_pdf_bytes(pdf_path: str) -> bytes:
        """Return the PDF bytes from a local path or an http(s) URL.

        Raises:
            RuntimeError: If the download does not return HTTP 200.
        """
        if pdf_path.startswith(("http://", "https://")):
            # A timeout keeps an unresponsive server from hanging the run.
            response = requests.get(pdf_path, timeout=60)
            if response.status_code != 200:
                logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
                raise RuntimeError(f"Failed to download PDF: {pdf_path}")
            pdf_bytes = response.content
            logger.info("Downloaded %d bytes for pdf_url='%s'", len(pdf_bytes), pdf_path)
            return pdf_bytes

        with open(pdf_path, "rb") as f:
            pdf_bytes = f.read()
        logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
        return pdf_bytes

    @staticmethod
    def _estimate_page_offset(pdf_bytes: bytes, subtopics: Dict[str, Any]) -> int:
        """Estimate the shift between printed page numbers and PDF page indices.

        Every page on which a subtopic name occurs yields a candidate offset
        ``page_index - (start_page - 1)``; only positive candidates are kept
        and the most common one wins. Returns 0 when there is no candidate.
        """
        from statistics import StatisticsError, median, mode

        offset_candidates = []
        for subname, rng in subtopics.items():
            if not (isinstance(rng, list) and len(rng) == 2):
                # Skip malformed ranges instead of failing on unpacking.
                continue
            start_p = rng[0]
            for p in find_all_occurrences(pdf_bytes, subname):
                candidate = p - (start_p - 1)
                # NOTE(review): an offset of exactly 0 is discarded here, as in
                # the original; confirm that is intended for PDFs whose printed
                # numbering matches the page order.
                if candidate > 0:
                    offset_candidates.append(candidate)

        if not offset_candidates:
            return 0
        try:
            return mode(offset_candidates)
        except StatisticsError:
            # Interpreters where mode() rejects ties fall back to the median.
            return int(median(offset_candidates))

    @staticmethod
    def _select_pages(subtopics: Dict[str, Any], global_offset: int, total_pages: int) -> set:
        """Return the 0-based page indices covered by the subtopic ranges.

        Ranges are 1-based and inclusive; *global_offset* is added to each
        bound. Indices are clamped to ``0 .. total_pages - 1`` so a range that
        runs past the end of the document cannot produce an invalid page.
        Malformed or inverted ranges are skipped.
        """
        pages = set()
        for rng in subtopics.values():
            if not (isinstance(rng, list) and len(rng) == 2):
                continue
            start_p, end_p = rng
            if start_p > end_p:
                continue
            first = max((start_p - 1) + global_offset, 0)
            last = min((end_p - 1) + global_offset, total_pages - 1)
            pages.update(range(first, last + 1))
        return pages

    def process(self, pdf_path: str) -> Dict[str, Any]:
        """Run the whole pipeline on one PDF.

        Args:
            pdf_path: Local path or http(s) URL of the PDF.

        Returns:
            ``{"final_markdown": str, "subtopics_extracted": list}``. The
            subtopic list is also written to ``final_subtopics.json`` in the
            output folder.

        Raises:
            RuntimeError: If the PDF cannot be downloaded. Other errors from
                the underlying libraries propagate unchanged. GPU memory is
                cleaned up in every case.
        """
        logger.info(f"Processing PDF: {pdf_path}")
        try:
            # 1) Ask Gemini for subtopic page ranges from the first pages.
            subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
            logger.info(f"Gemini returned subtopics: {subtopics}")

            # 2) Load the PDF and count its pages.
            pdf_bytes = self._load_pdf_bytes(pdf_path)
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            try:
                total_pages = doc.page_count
            finally:
                doc.close()

            # 3) Decide which pages to process; fall back to the whole
            #    document when no usable range was found.
            final_pages = set()
            if subtopics:
                global_offset = self._estimate_page_offset(pdf_bytes, subtopics)
                logger.info(f"Computed global offset: {global_offset}")
                final_pages = self._select_pages(subtopics, global_offset, total_pages)
            if not final_pages:
                final_pages = set(range(total_pages))

            logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
            subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))

            # 4) Analyze and produce markdown.
            dataset = PymuDocDataset(subset_pdf_bytes)
            inference = doc_analyze(
                dataset,
                ocr=True,
                lang=self.language,
                layout_model=self.layout_model,
                formula_enable=self.formula_enable,
                table_enable=self.table_enable,
            )
            writer = S3ImageWriter(self.s3_writer, "/topic-extraction", self.gemini_api_key)
            md_prefix = "/topic-extraction/"
            pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
            md_content = pipe_result.get_markdown(md_prefix)
            final_markdown = writer.post_process(md_prefix, md_content)

            subtopic_list = list(writer.extracted_subtopics.values())

            out_path = os.path.join(self.output_folder, "final_subtopics.json")
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(subtopic_list, f, indent=2)
            logger.info(f"Final subtopics JSON saved locally at {out_path}")

            return {
                "final_markdown": final_markdown,
                "subtopics_extracted": subtopic_list,
            }
        finally:
            self.cleanup_gpu()
822
+
823
if __name__ == "__main__":
    # Ad-hoc entry point: run the pipeline on one hard-coded specification PDF.
    input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
    output_dir = "/home/user/app/we"

    # SECURITY: the API key must come from the environment. A literal key was
    # previously committed here as the fallback value; it has been removed and
    # that key should be revoked.
    gemini_key = os.getenv("GEMINI_API_KEY", "")
    if not gemini_key:
        raise SystemExit("GEMINI_API_KEY is not set; export it before running this script.")

    try:
        processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
        # The result includes final_markdown and subtopics_extracted.
        result = processor.process(input_pdf)
        logger.info("Processing completed successfully.")
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"Processing failed: {e}")
topic_extraction.log CHANGED
@@ -728,3 +728,467 @@ and series'. Using page 7.
728
  2025-03-02 15:59:11,730 [WARNING] __main__ - No suitable occurrence for '2.4 A2 Unit 4'. Using page 3.
729
  2025-03-02 15:59:11,730 [INFO] __main__ - Processing pages (0-based): [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
730
  2025-03-02 15:59:48,050 [INFO] __main__ - GPU memory cleaned up.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
728
  2025-03-02 15:59:11,730 [WARNING] __main__ - No suitable occurrence for '2.4 A2 Unit 4'. Using page 3.
729
  2025-03-02 15:59:11,730 [INFO] __main__ - Processing pages (0-based): [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
730
  2025-03-02 15:59:48,050 [INFO] __main__ - GPU memory cleaned up.
731
+ 2025-03-03 13:26:57,282 [INFO] __main__ - Processing PDF: /home/user/app/input_output/a-level-pearson-mathematics-specification.pdf
732
+ 2025-03-03 13:26:58,095 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
733
+ 2025-03-03 13:26:58,096 [INFO] __main__ - Loaded 1135473 bytes from local file '/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf'
734
+ 2025-03-03 13:26:58,447 [INFO] __main__ - Computed global offset: 4
735
+ 2025-03-03 13:26:58,447 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
736
+ 2025-03-03 13:28:51,814 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
737
+ 2025-03-03 13:28:53,673 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
738
+ 2025-03-03 13:28:54,260 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
739
+ 2025-03-03 13:28:54,783 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
740
+ 2025-03-03 13:28:55,347 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
741
+ 2025-03-03 13:28:55,931 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
742
+ 2025-03-03 13:28:56,425 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
743
+ 2025-03-03 13:28:57,012 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
744
+ 2025-03-03 13:28:57,496 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
745
+ 2025-03-03 13:28:58,025 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
746
+ 2025-03-03 13:28:58,574 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
747
+ 2025-03-03 13:28:59,112 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
748
+ 2025-03-03 13:28:59,574 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
749
+ 2025-03-03 13:29:00,155 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
750
+ 2025-03-03 13:29:00,690 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
751
+ 2025-03-03 13:29:01,291 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
752
+ 2025-03-03 13:29:01,867 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
753
+ 2025-03-03 13:29:02,177 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
754
+ 2025-03-03 13:29:02,676 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
755
+ 2025-03-03 13:29:03,274 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
756
+ 2025-03-03 13:29:03,849 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
757
+ 2025-03-03 13:29:04,424 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
758
+ 2025-03-03 13:29:04,883 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
759
+ 2025-03-03 13:29:05,416 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
760
+ 2025-03-03 13:29:05,965 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
761
+ 2025-03-03 13:29:06,408 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
762
+ 2025-03-03 13:29:06,899 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
763
+ 2025-03-03 13:29:07,492 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
764
+ 2025-03-03 13:29:07,914 [INFO] __main__ - Classifying images to detect tables.
765
+ 2025-03-03 13:29:11,945 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
766
+ 2025-03-03 13:29:15,280 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c0.jpg
767
+ 2025-03-03 13:29:15,569 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c1.jpg
768
+ 2025-03-03 13:29:15,771 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c0.jpg
769
+ 2025-03-03 13:29:16,000 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c1.jpg
770
+ 2025-03-03 13:29:16,001 [ERROR] __main__ - Error processing table image /topic-extraction/img_1.jpg: No module named 'your_module'
771
+ 2025-03-03 13:29:16,002 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
772
+ 2025-03-03 13:29:19,539 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c0.jpg
773
+ 2025-03-03 13:29:19,840 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c1.jpg
774
+ 2025-03-03 13:29:20,032 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r1_c0.jpg
775
+ 2025-03-03 13:29:20,254 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r2_c0.jpg
776
+ 2025-03-03 13:29:20,493 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r3_c0.jpg
777
+ 2025-03-03 13:29:20,493 [ERROR] __main__ - Error processing table image /topic-extraction/img_2.jpg: No module named 'your_module'
778
+ 2025-03-03 13:29:20,494 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
779
+ 2025-03-03 13:29:23,853 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c0.jpg
780
+ 2025-03-03 13:29:24,160 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c1.jpg
781
+ 2025-03-03 13:29:24,408 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r1_c0.jpg
782
+ 2025-03-03 13:29:24,409 [ERROR] __main__ - Error processing table image /topic-extraction/img_3.jpg: No module named 'your_module'
783
+ 2025-03-03 13:29:24,410 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
784
+ 2025-03-03 13:29:27,500 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c0.jpg
785
+ 2025-03-03 13:29:27,818 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c1.jpg
786
+ 2025-03-03 13:29:28,045 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c0.jpg
787
+ 2025-03-03 13:29:28,266 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c1.jpg
788
+ 2025-03-03 13:29:28,268 [ERROR] __main__ - Error processing table image /topic-extraction/img_4.jpg: No module named 'your_module'
789
+ 2025-03-03 13:29:28,268 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
790
+ 2025-03-03 13:29:31,772 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c0.jpg
791
+ 2025-03-03 13:29:32,031 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c1.jpg
792
+ 2025-03-03 13:29:32,220 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c0.jpg
793
+ 2025-03-03 13:29:32,432 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c1.jpg
794
+ 2025-03-03 13:29:32,646 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r2_c0.jpg
795
+ 2025-03-03 13:29:32,648 [ERROR] __main__ - Error processing table image /topic-extraction/img_5.jpg: No module named 'your_module'
796
+ 2025-03-03 13:29:32,648 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
797
+ 2025-03-03 13:29:36,400 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c0.jpg
798
+ 2025-03-03 13:29:36,689 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c1.jpg
799
+ 2025-03-03 13:29:36,923 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c0.jpg
800
+ 2025-03-03 13:29:37,135 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c1.jpg
801
+ 2025-03-03 13:29:37,137 [ERROR] __main__ - Error processing table image /topic-extraction/img_6.jpg: No module named 'your_module'
802
+ 2025-03-03 13:29:37,137 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
803
+ 2025-03-03 13:29:40,804 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c0.jpg
804
+ 2025-03-03 13:29:41,126 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c1.jpg
805
+ 2025-03-03 13:29:41,336 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r1_c0.jpg
806
+ 2025-03-03 13:29:41,527 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r2_c0.jpg
807
+ 2025-03-03 13:29:41,750 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r2_c1.jpg
808
+ 2025-03-03 13:29:41,752 [ERROR] __main__ - Error processing table image /topic-extraction/img_7.jpg: No module named 'your_module'
809
+ 2025-03-03 13:29:41,752 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
810
+ 2025-03-03 13:29:45,625 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c0.jpg
811
+ 2025-03-03 13:29:45,929 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c1.jpg
812
+ 2025-03-03 13:29:46,120 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c2.jpg
813
+ 2025-03-03 13:29:46,332 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c0.jpg
814
+ 2025-03-03 13:29:46,554 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c1.jpg
815
+ 2025-03-03 13:29:46,785 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c2.jpg
816
+ 2025-03-03 13:29:46,973 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r2_c0.jpg
817
+ 2025-03-03 13:29:47,161 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r2_c1.jpg
818
+ 2025-03-03 13:29:47,371 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r3_c0.jpg
819
+ 2025-03-03 13:29:47,604 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r3_c1.jpg
820
+ 2025-03-03 13:29:47,860 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r4_c0.jpg
821
+ 2025-03-03 13:29:48,073 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r4_c1.jpg
822
+ 2025-03-03 13:29:48,293 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r5_c0.jpg
823
+ 2025-03-03 13:29:48,527 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r5_c1.jpg
824
+ 2025-03-03 13:29:48,528 [ERROR] __main__ - Error processing table image /topic-extraction/img_8.jpg: No module named 'your_module'
825
+ 2025-03-03 13:29:48,529 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
826
+ 2025-03-03 13:29:52,131 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c0.jpg
827
+ 2025-03-03 13:29:52,394 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c1.jpg
828
+ 2025-03-03 13:29:52,608 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c2.jpg
829
+ 2025-03-03 13:29:52,828 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r1_c0.jpg
830
+ 2025-03-03 13:29:53,040 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r1_c1.jpg
831
+ 2025-03-03 13:29:53,230 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r2_c0.jpg
832
+ 2025-03-03 13:29:53,483 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r2_c1.jpg
833
+ 2025-03-03 13:29:53,712 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r3_c0.jpg
834
+ 2025-03-03 13:29:53,907 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r3_c1.jpg
835
+ 2025-03-03 13:29:53,908 [ERROR] __main__ - Error processing table image /topic-extraction/img_9.jpg: No module named 'your_module'
836
+ 2025-03-03 13:29:53,908 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
837
+ 2025-03-03 13:29:57,203 [INFO] __main__ - GPU memory cleaned up.
838
+ 2025-03-03 13:33:11,287 [INFO] __main__ - Processing PDF: /home/user/app/input_output/a-level-pearson-mathematics-specification.pdf
839
+ 2025-03-03 13:33:12,086 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
840
+ 2025-03-03 13:33:12,087 [INFO] __main__ - Loaded 1135473 bytes from local file '/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf'
841
+ 2025-03-03 13:33:12,450 [INFO] __main__ - Computed global offset: 4
842
+ 2025-03-03 13:33:12,450 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
843
+ 2025-03-03 13:34:08,590 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
844
+ 2025-03-03 13:34:10,443 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
845
+ 2025-03-03 13:34:11,003 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
846
+ 2025-03-03 13:34:11,551 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
847
+ 2025-03-03 13:34:12,104 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
848
+ 2025-03-03 13:34:12,646 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
849
+ 2025-03-03 13:34:13,132 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
850
+ 2025-03-03 13:34:13,599 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
851
+ 2025-03-03 13:34:14,153 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
852
+ 2025-03-03 13:34:14,847 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
853
+ 2025-03-03 13:34:15,304 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
854
+ 2025-03-03 13:34:15,831 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
855
+ 2025-03-03 13:34:16,257 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
856
+ 2025-03-03 13:34:16,888 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
857
+ 2025-03-03 13:34:17,395 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
858
+ 2025-03-03 13:34:17,984 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
859
+ 2025-03-03 13:34:18,529 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
860
+ 2025-03-03 13:34:18,824 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
861
+ 2025-03-03 13:34:19,296 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
862
+ 2025-03-03 13:34:19,924 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
863
+ 2025-03-03 13:34:20,496 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
864
+ 2025-03-03 13:34:21,104 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
865
+ 2025-03-03 13:34:21,531 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
866
+ 2025-03-03 13:34:22,061 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
867
+ 2025-03-03 13:34:22,624 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
868
+ 2025-03-03 13:34:23,058 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
869
+ 2025-03-03 13:34:23,603 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
870
+ 2025-03-03 13:34:24,201 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
871
+ 2025-03-03 13:34:24,606 [INFO] __main__ - Classifying images to detect tables.
872
+ 2025-03-03 13:34:28,395 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
873
+ 2025-03-03 13:34:31,655 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c0.jpg
874
+ 2025-03-03 13:34:31,943 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c1.jpg
875
+ 2025-03-03 13:34:32,144 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c0.jpg
876
+ 2025-03-03 13:34:32,376 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c1.jpg
877
+ 2025-03-03 13:34:32,801 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
878
+ 2025-03-03 13:34:36,018 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c0.jpg
879
+ 2025-03-03 13:34:36,281 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c1.jpg
880
+ 2025-03-03 13:34:36,504 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r1_c0.jpg
881
+ 2025-03-03 13:34:36,725 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r2_c0.jpg
882
+ 2025-03-03 13:34:36,916 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r3_c0.jpg
883
+ 2025-03-03 13:34:37,320 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
884
+ 2025-03-03 13:34:40,825 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c0.jpg
885
+ 2025-03-03 13:34:41,120 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c1.jpg
886
+ 2025-03-03 13:34:41,332 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r1_c0.jpg
887
+ 2025-03-03 13:34:41,846 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
888
+ 2025-03-03 13:34:44,953 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c0.jpg
889
+ 2025-03-03 13:34:45,248 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c1.jpg
890
+ 2025-03-03 13:34:45,467 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c0.jpg
891
+ 2025-03-03 13:34:45,695 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c1.jpg
892
+ 2025-03-03 13:34:46,080 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
893
+ 2025-03-03 13:34:49,588 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c0.jpg
894
+ 2025-03-03 13:34:49,913 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c1.jpg
895
+ 2025-03-03 13:34:50,102 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c0.jpg
896
+ 2025-03-03 13:34:50,361 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c1.jpg
897
+ 2025-03-03 13:34:50,608 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r2_c0.jpg
898
+ 2025-03-03 13:34:51,003 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
899
+ 2025-03-03 13:34:54,650 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c0.jpg
900
+ 2025-03-03 13:34:54,948 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c1.jpg
901
+ 2025-03-03 13:34:55,165 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c0.jpg
902
+ 2025-03-03 13:34:55,384 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c1.jpg
903
+ 2025-03-03 13:34:55,811 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
904
+ 2025-03-03 13:34:59,337 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c0.jpg
905
+ 2025-03-03 13:34:59,623 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c1.jpg
906
+ 2025-03-03 13:34:59,864 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r1_c0.jpg
907
+ 2025-03-03 13:35:00,087 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r2_c0.jpg
908
+ 2025-03-03 13:35:00,301 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r2_c1.jpg
909
+ 2025-03-03 13:35:00,734 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
910
+ 2025-03-03 13:35:04,169 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c0.jpg
911
+ 2025-03-03 13:35:04,466 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c1.jpg
912
+ 2025-03-03 13:35:04,693 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c2.jpg
913
+ 2025-03-03 13:35:04,880 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c0.jpg
914
+ 2025-03-03 13:35:05,111 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c1.jpg
915
+ 2025-03-03 13:35:05,414 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c2.jpg
916
+ 2025-03-03 13:35:05,607 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r2_c0.jpg
917
+ 2025-03-03 13:35:05,841 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r2_c1.jpg
918
+ 2025-03-03 13:35:06,062 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r3_c0.jpg
919
+ 2025-03-03 13:35:06,273 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r3_c1.jpg
920
+ 2025-03-03 13:35:06,497 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r4_c0.jpg
921
+ 2025-03-03 13:35:06,737 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r4_c1.jpg
922
+ 2025-03-03 13:35:06,967 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r5_c0.jpg
923
+ 2025-03-03 13:35:07,191 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r5_c1.jpg
924
+ 2025-03-03 13:35:07,636 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
925
+ 2025-03-03 13:35:11,212 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c0.jpg
926
+ 2025-03-03 13:35:11,510 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c1.jpg
927
+ 2025-03-03 13:35:11,732 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c2.jpg
928
+ 2025-03-03 13:35:11,922 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r1_c0.jpg
929
+ 2025-03-03 13:35:12,139 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r1_c1.jpg
930
+ 2025-03-03 13:35:12,349 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r2_c0.jpg
931
+ 2025-03-03 13:35:12,596 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r2_c1.jpg
932
+ 2025-03-03 13:35:12,802 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r3_c0.jpg
933
+ 2025-03-03 13:35:13,038 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r3_c1.jpg
934
+ 2025-03-03 13:35:13,489 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
935
+ 2025-03-03 13:35:16,908 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c0.jpg
936
+ 2025-03-03 13:35:17,202 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c1.jpg
937
+ 2025-03-03 13:35:17,420 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r1_c0.jpg
938
+ 2025-03-03 13:35:17,635 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r2_c0.jpg
939
+ 2025-03-03 13:35:17,860 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r3_c0.jpg
940
+ 2025-03-03 13:35:18,244 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=two
941
+ 2025-03-03 13:35:21,413 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r0_c0.jpg
942
+ 2025-03-03 13:35:21,722 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r1_c0.jpg
943
+ 2025-03-03 13:35:21,947 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r2_c0.jpg
944
+ 2025-03-03 13:35:22,173 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r3_c0.jpg
945
+ 2025-03-03 13:35:22,430 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r4_c0.jpg
946
+ 2025-03-03 13:35:22,616 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r5_c0.jpg
947
+ 2025-03-03 13:35:23,009 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
948
+ 2025-03-03 13:35:26,169 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c0.jpg
949
+ 2025-03-03 13:35:26,452 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c1.jpg
950
+ 2025-03-03 13:35:26,674 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r1_c0.jpg
951
+ 2025-03-03 13:35:26,894 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r1_c1.jpg
952
+ 2025-03-03 13:35:27,104 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r2_c0.jpg
953
+ 2025-03-03 13:35:27,342 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r2_c1.jpg
954
+ 2025-03-03 13:35:27,736 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
955
+ 2025-03-03 13:35:30,898 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c0.jpg
956
+ 2025-03-03 13:35:31,165 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c1.jpg
957
+ 2025-03-03 13:35:31,398 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r1_c0.jpg
958
+ 2025-03-03 13:35:31,601 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r1_c1.jpg
959
+ 2025-03-03 13:35:31,789 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r2_c0.jpg
960
+ 2025-03-03 13:35:32,007 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r3_c0.jpg
961
+ 2025-03-03 13:35:32,449 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=three
962
+ 2025-03-03 13:35:36,914 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r0_c0.jpg
963
+ 2025-03-03 13:35:37,204 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r0_c1.jpg
964
+ 2025-03-03 13:35:37,427 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r1_c0.jpg
965
+ 2025-03-03 13:35:37,643 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r1_c1.jpg
966
+ 2025-03-03 13:35:37,861 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r2_c0.jpg
967
+ 2025-03-03 13:35:38,087 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r3_c0.jpg
968
+ 2025-03-03 13:35:38,312 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r4_c0.jpg
969
+ 2025-03-03 13:35:38,536 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r4_c1.jpg
970
+ 2025-03-03 13:35:38,735 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r5_c0.jpg
971
+ 2025-03-03 13:35:39,132 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
972
+ 2025-03-03 13:35:42,577 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c0.jpg
973
+ 2025-03-03 13:35:42,882 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c1.jpg
974
+ 2025-03-03 13:35:43,086 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r1_c0.jpg
975
+ 2025-03-03 13:35:43,310 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r1_c1.jpg
976
+ 2025-03-03 13:35:43,529 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r2_c0.jpg
977
+ 2025-03-03 13:35:43,718 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r3_c0.jpg
978
+ 2025-03-03 13:35:43,914 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r4_c0.jpg
979
+ 2025-03-03 13:35:44,304 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
980
+ 2025-03-03 13:35:48,617 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c0.jpg
981
+ 2025-03-03 13:35:48,915 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c1.jpg
982
+ 2025-03-03 13:35:49,124 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r1_c0.jpg
983
+ 2025-03-03 13:35:49,350 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r1_c1.jpg
984
+ 2025-03-03 13:35:49,578 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r2_c0.jpg
985
+ 2025-03-03 13:35:49,801 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r3_c0.jpg
986
+ 2025-03-03 13:35:50,013 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r3_c1.jpg
987
+ 2025-03-03 13:35:50,239 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r4_c0.jpg
988
+ 2025-03-03 13:35:51,085 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r5_c0.jpg
989
+ 2025-03-03 13:35:51,516 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
990
+ 2025-03-03 13:35:55,302 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c0.jpg
991
+ 2025-03-03 13:35:55,608 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c1.jpg
992
+ 2025-03-03 13:35:55,831 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r1_c0.jpg
993
+ 2025-03-03 13:35:56,034 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r2_c0.jpg
994
+ 2025-03-03 13:35:56,259 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r2_c1.jpg
995
+ 2025-03-03 13:35:56,467 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r3_c0.jpg
996
+ 2025-03-03 13:35:56,661 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r4_c0.jpg
997
+ 2025-03-03 13:35:56,882 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r5_c0.jpg
998
+ 2025-03-03 13:35:57,282 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
999
+ 2025-03-03 13:35:58,547 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c0.jpg
1000
+ 2025-03-03 13:35:58,740 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c1.jpg
1001
+ 2025-03-03 13:35:58,934 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r1_c0.jpg
1002
+ 2025-03-03 13:35:59,124 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r1_c1.jpg
1003
+ 2025-03-03 13:35:59,543 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
1004
+ 2025-03-03 13:36:02,472 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c0.jpg
1005
+ 2025-03-03 13:36:02,792 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c1.jpg
1006
+ 2025-03-03 13:36:03,024 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r1_c0.jpg
1007
+ 2025-03-03 13:36:03,265 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r1_c1.jpg
1008
+ 2025-03-03 13:36:03,521 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r2_c0.jpg
1009
+ 2025-03-03 13:36:03,745 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r2_c1.jpg
1010
+ 2025-03-03 13:36:04,162 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=three
1011
+ 2025-03-03 13:36:09,253 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r0_c0.jpg
1012
+ 2025-03-03 13:36:09,551 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r0_c1.jpg
1013
+ 2025-03-03 13:36:09,791 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r1_c0.jpg
1014
+ 2025-03-03 13:36:09,983 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r1_c1.jpg
1015
+ 2025-03-03 13:36:10,370 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
1016
+ 2025-03-03 13:36:14,808 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c0.jpg
1017
+ 2025-03-03 13:36:15,103 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c1.jpg
1018
+ 2025-03-03 13:36:15,329 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r1_c0.jpg
1019
+ 2025-03-03 13:36:15,570 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r1_c1.jpg
1020
+ 2025-03-03 13:36:16,020 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
1021
+ 2025-03-03 13:36:20,786 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c0.jpg
1022
+ 2025-03-03 13:36:21,053 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c1.jpg
1023
+ 2025-03-03 13:36:21,295 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r1_c0.jpg
1024
+ 2025-03-03 13:36:21,539 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r1_c1.jpg
1025
+ 2025-03-03 13:36:21,753 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r2_c0.jpg
1026
+ 2025-03-03 13:36:21,966 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r2_c1.jpg
1027
+ 2025-03-03 13:36:22,203 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r3_c0.jpg
1028
+ 2025-03-03 13:36:22,623 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
1029
+ 2025-03-03 13:36:25,781 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c0.jpg
1030
+ 2025-03-03 13:36:26,112 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c1.jpg
1031
+ 2025-03-03 13:36:26,300 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r1_c0.jpg
1032
+ 2025-03-03 13:36:26,540 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r1_c1.jpg
1033
+ 2025-03-03 13:36:26,953 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
1034
+ 2025-03-03 13:36:30,871 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c0.jpg
1035
+ 2025-03-03 13:36:31,169 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c1.jpg
1036
+ 2025-03-03 13:36:31,360 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r1_c0.jpg
1037
+ 2025-03-03 13:36:31,586 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r1_c1.jpg
1038
+ 2025-03-03 13:36:31,839 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r2_c0.jpg
1039
+ 2025-03-03 13:36:32,233 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=two
1040
+ 2025-03-03 13:36:35,440 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r0_c0.jpg
1041
+ 2025-03-03 13:36:35,737 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r1_c0.jpg
1042
+ 2025-03-03 13:36:35,958 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r2_c0.jpg
1043
+ 2025-03-03 13:36:36,186 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r3_c0.jpg
1044
+ 2025-03-03 13:36:36,417 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r4_c0.jpg
1045
+ 2025-03-03 13:36:36,615 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r5_c0.jpg
1046
+ 2025-03-03 13:36:37,031 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
1047
+ 2025-03-03 13:36:40,174 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c0.jpg
1048
+ 2025-03-03 13:36:40,461 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c1.jpg
1049
+ 2025-03-03 13:36:40,681 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r1_c0.jpg
1050
+ 2025-03-03 13:36:40,872 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r2_c0.jpg
1051
+ 2025-03-03 13:36:41,298 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
1052
+ 2025-03-03 13:36:44,242 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c0.jpg
1053
+ 2025-03-03 13:36:44,546 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c1.jpg
1054
+ 2025-03-03 13:36:44,735 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r1_c0.jpg
1055
+ 2025-03-03 13:36:44,958 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r1_c1.jpg
1056
+ 2025-03-03 13:36:45,189 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r2_c0.jpg
1057
+ 2025-03-03 13:36:45,401 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r3_c0.jpg
1058
+ 2025-03-03 13:36:45,632 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r4_c0.jpg
1059
+ 2025-03-03 13:36:45,853 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r4_c1.jpg
1060
+ 2025-03-03 13:36:46,263 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=two
1061
+ 2025-03-03 13:36:49,648 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r0_c0.jpg
1062
+ 2025-03-03 13:36:49,911 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r1_c0.jpg
1063
+ 2025-03-03 13:36:50,133 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r2_c0.jpg
1064
+ 2025-03-03 13:36:50,350 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r3_c0.jpg
1065
+ 2025-03-03 13:36:50,760 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/we/final_subtopics.json
1066
+ 2025-03-03 13:36:51,086 [INFO] __main__ - GPU memory cleaned up.
1067
+ 2025-03-03 13:36:51,093 [INFO] __main__ - Processing completed successfully.
1068
+ 2025-03-03 14:05:17,866 [INFO] __main__ - Processing PDF: /home/user/app/input_output/a-level-pearson-mathematics-specification.pdf
1069
+ 2025-03-03 14:05:18,700 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
1070
+ 2025-03-03 14:05:18,702 [INFO] __main__ - Loaded 1135473 bytes from local file '/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf'
1071
+ 2025-03-03 14:05:19,046 [INFO] __main__ - Computed global offset: 4
1072
+ 2025-03-03 14:05:19,047 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
1073
+ 2025-03-03 14:05:52,370 [INFO] __main__ - GPU memory cleaned up.
1074
+ 2025-03-03 14:10:28,391 [INFO] __main__ - Processing PDF: /home/user/app/input_output/a-level-pearson-mathematics-specification.pdf
1075
+ 2025-03-03 14:10:29,161 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
1076
+ 2025-03-03 14:10:29,162 [INFO] __main__ - Loaded 1135473 bytes from local file '/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf'
1077
+ 2025-03-03 14:10:29,484 [INFO] __main__ - Computed global offset: 4
1078
+ 2025-03-03 14:10:29,484 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
1079
+ 2025-03-03 14:11:29,432 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
1080
+ 2025-03-03 14:11:31,185 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
1081
+ 2025-03-03 14:11:31,702 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
1082
+ 2025-03-03 14:11:32,212 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
1083
+ 2025-03-03 14:11:32,763 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
1084
+ 2025-03-03 14:11:33,406 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
1085
+ 2025-03-03 14:11:33,898 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
1086
+ 2025-03-03 14:11:34,395 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
1087
+ 2025-03-03 14:11:34,878 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
1088
+ 2025-03-03 14:11:35,417 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
1089
+ 2025-03-03 14:11:35,838 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
1090
+ 2025-03-03 14:11:36,385 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
1091
+ 2025-03-03 14:11:36,971 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
1092
+ 2025-03-03 14:11:37,669 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
1093
+ 2025-03-03 14:11:38,314 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
1094
+ 2025-03-03 14:11:38,926 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
1095
+ 2025-03-03 14:11:39,484 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
1096
+ 2025-03-03 14:11:39,846 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
1097
+ 2025-03-03 14:11:40,381 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
1098
+ 2025-03-03 14:11:40,979 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
1099
+ 2025-03-03 14:11:41,538 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
1100
+ 2025-03-03 14:11:42,104 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
1101
+ 2025-03-03 14:11:42,640 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
1102
+ 2025-03-03 14:11:43,153 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
1103
+ 2025-03-03 14:11:43,663 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
1104
+ 2025-03-03 14:11:44,172 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
1105
+ 2025-03-03 14:11:44,677 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
1106
+ 2025-03-03 14:11:45,255 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
1107
+ 2025-03-03 14:11:45,680 [INFO] __main__ - Classifying images to detect tables.
1108
+ 2025-03-03 14:11:49,521 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
1109
+ 2025-03-03 14:11:52,761 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c0.jpg
1110
+ 2025-03-03 14:11:52,762 [ERROR] __main__ - Error processing table image /topic-extraction/img_1.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1111
+ 2025-03-03 14:11:52,762 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
1112
+ 2025-03-03 14:11:56,000 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c0.jpg
1113
+ 2025-03-03 14:11:56,000 [ERROR] __main__ - Error processing table image /topic-extraction/img_2.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1114
+ 2025-03-03 14:11:56,000 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
1115
+ 2025-03-03 14:11:59,555 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c0.jpg
1116
+ 2025-03-03 14:11:59,555 [ERROR] __main__ - Error processing table image /topic-extraction/img_3.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1117
+ 2025-03-03 14:11:59,556 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
1118
+ 2025-03-03 14:12:02,696 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c0.jpg
1119
+ 2025-03-03 14:12:02,697 [ERROR] __main__ - Error processing table image /topic-extraction/img_4.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1120
+ 2025-03-03 14:12:02,697 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
1121
+ 2025-03-03 14:12:06,308 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c0.jpg
1122
+ 2025-03-03 14:12:06,309 [ERROR] __main__ - Error processing table image /topic-extraction/img_5.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1123
+ 2025-03-03 14:12:06,309 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
1124
+ 2025-03-03 14:12:10,140 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c0.jpg
1125
+ 2025-03-03 14:12:10,141 [ERROR] __main__ - Error processing table image /topic-extraction/img_6.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1126
+ 2025-03-03 14:12:10,141 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
1127
+ 2025-03-03 14:12:13,713 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c0.jpg
1128
+ 2025-03-03 14:12:13,713 [ERROR] __main__ - Error processing table image /topic-extraction/img_7.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1129
+ 2025-03-03 14:12:13,713 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
1130
+ 2025-03-03 14:12:17,306 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c0.jpg
1131
+ 2025-03-03 14:12:17,306 [ERROR] __main__ - Error processing table image /topic-extraction/img_8.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1132
+ 2025-03-03 14:12:17,306 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
1133
+ 2025-03-03 14:12:21,354 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c0.jpg
1134
+ 2025-03-03 14:12:21,354 [ERROR] __main__ - Error processing table image /topic-extraction/img_9.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1135
+ 2025-03-03 14:12:21,355 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
1136
+ 2025-03-03 14:12:24,668 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c0.jpg
1137
+ 2025-03-03 14:12:24,668 [ERROR] __main__ - Error processing table image /topic-extraction/img_10.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1138
+ 2025-03-03 14:12:24,668 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=two
1139
+ 2025-03-03 14:12:27,914 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r0_c0.jpg
1140
+ 2025-03-03 14:12:27,914 [ERROR] __main__ - Error processing table image /topic-extraction/img_11.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1141
+ 2025-03-03 14:12:27,915 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
1142
+ 2025-03-03 14:12:31,345 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c0.jpg
1143
+ 2025-03-03 14:12:31,346 [ERROR] __main__ - Error processing table image /topic-extraction/img_12.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1144
+ 2025-03-03 14:12:31,346 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
1145
+ 2025-03-03 14:12:34,536 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c0.jpg
1146
+ 2025-03-03 14:12:34,536 [ERROR] __main__ - Error processing table image /topic-extraction/img_13.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1147
+ 2025-03-03 14:12:34,536 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=three
1148
+ 2025-03-03 14:12:39,055 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r0_c0.jpg
1149
+ 2025-03-03 14:12:39,056 [ERROR] __main__ - Error processing table image /topic-extraction/img_14.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1150
+ 2025-03-03 14:12:39,056 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
1151
+ 2025-03-03 14:12:43,762 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c0.jpg
1152
+ 2025-03-03 14:12:43,763 [ERROR] __main__ - Error processing table image /topic-extraction/img_15.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1153
+ 2025-03-03 14:12:43,763 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
1154
+ 2025-03-03 14:12:48,110 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c0.jpg
1155
+ 2025-03-03 14:12:48,110 [ERROR] __main__ - Error processing table image /topic-extraction/img_16.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1156
+ 2025-03-03 14:12:48,110 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
1157
+ 2025-03-03 14:12:52,283 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c0.jpg
1158
+ 2025-03-03 14:12:52,283 [ERROR] __main__ - Error processing table image /topic-extraction/img_17.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1159
+ 2025-03-03 14:12:52,284 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
1160
+ 2025-03-03 14:12:53,748 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c0.jpg
1161
+ 2025-03-03 14:12:53,748 [ERROR] __main__ - Error processing table image /topic-extraction/img_18.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1162
+ 2025-03-03 14:12:53,749 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
1163
+ 2025-03-03 14:12:57,191 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c0.jpg
1164
+ 2025-03-03 14:12:57,191 [ERROR] __main__ - Error processing table image /topic-extraction/img_19.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1165
+ 2025-03-03 14:12:57,192 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=three
1166
+ 2025-03-03 14:13:01,859 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r0_c0.jpg
1167
+ 2025-03-03 14:13:01,862 [ERROR] __main__ - Error processing table image /topic-extraction/img_20.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1168
+ 2025-03-03 14:13:01,862 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
1169
+ 2025-03-03 14:13:05,555 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c0.jpg
1170
+ 2025-03-03 14:13:05,556 [ERROR] __main__ - Error processing table image /topic-extraction/img_21.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1171
+ 2025-03-03 14:13:05,556 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
1172
+ 2025-03-03 14:13:10,335 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c0.jpg
1173
+ 2025-03-03 14:13:10,336 [ERROR] __main__ - Error processing table image /topic-extraction/img_22.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1174
+ 2025-03-03 14:13:10,336 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
1175
+ 2025-03-03 14:13:13,658 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c0.jpg
1176
+ 2025-03-03 14:13:13,658 [ERROR] __main__ - Error processing table image /topic-extraction/img_23.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1177
+ 2025-03-03 14:13:13,659 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
1178
+ 2025-03-03 14:13:17,286 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c0.jpg
1179
+ 2025-03-03 14:13:17,287 [ERROR] __main__ - Error processing table image /topic-extraction/img_24.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1180
+ 2025-03-03 14:13:17,287 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=two
1181
+ 2025-03-03 14:13:20,933 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r0_c0.jpg
1182
+ 2025-03-03 14:13:20,933 [ERROR] __main__ - Error processing table image /topic-extraction/img_25.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1183
+ 2025-03-03 14:13:20,933 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
1184
+ 2025-03-03 14:13:23,943 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c0.jpg
1185
+ 2025-03-03 14:13:23,943 [ERROR] __main__ - Error processing table image /topic-extraction/img_26.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1186
+ 2025-03-03 14:13:23,944 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
1187
+ 2025-03-03 14:13:27,633 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c0.jpg
1188
+ 2025-03-03 14:13:27,633 [ERROR] __main__ - Error processing table image /topic-extraction/img_27.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1189
+ 2025-03-03 14:13:27,633 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=two
1190
+ 2025-03-03 14:13:31,099 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r0_c0.jpg
1191
+ 2025-03-03 14:13:31,100 [ERROR] __main__ - Error processing table image /topic-extraction/img_28.jpg: 'TableExtractor' object has no attribute 'crop_cell'
1192
+ 2025-03-03 14:13:31,102 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/we/final_subtopics.json
1193
+ 2025-03-03 14:13:31,434 [INFO] __main__ - GPU memory cleaned up.
1194
+ 2025-03-03 14:13:31,442 [INFO] __main__ - Processing completed successfully.
we/final_subtopics.json ADDED
@@ -0,0 +1 @@
 
 
1
+ []