Arsenii11 committed on
Commit
e8ef287
·
1 Parent(s): 85bcd32
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. __pycache__/topic_extraction.cpython-310.pyc +0 -0
  2. pearson_json/final_subtopics.json +142 -0
  3. topic_extr.py +233 -72
  4. topic_extract_arsenii.py +883 -0
  5. topic_extraction.log +0 -0
  6. topic_extraction.py +1 -1
  7. topic_extraction_ars.log +460 -0
  8. we/final_subtopics.json +282 -1
  9. we/we_ars/final_subtopics.json +282 -0
  10. wje/final_output.json +265 -0
  11. wje/final_output_local.json +265 -0
  12. wje/img_1.jpg_rows/row_0/col_0.png +0 -0
  13. wje/img_1.jpg_rows/row_0/col_1.png +0 -0
  14. wje/img_1.jpg_rows/row_1/col_0.png +0 -0
  15. wje/img_1.jpg_rows/row_1/col_1.png +0 -0
  16. wje/img_10.jpg_rows/row_0/col_0.png +0 -0
  17. wje/img_10.jpg_rows/row_0/col_1.png +0 -0
  18. wje/img_10.jpg_rows/row_1/col_0.png +0 -0
  19. wje/img_10.jpg_rows/row_2/col_0.png +0 -0
  20. wje/img_10.jpg_rows/row_3/col_0.png +0 -0
  21. wje/img_11.jpg_rows/row_0/col_0.png +0 -0
  22. wje/img_11.jpg_rows/row_1/col_0.png +0 -0
  23. wje/img_11.jpg_rows/row_2/col_0.png +0 -0
  24. wje/img_11.jpg_rows/row_3/col_0.png +0 -0
  25. wje/img_11.jpg_rows/row_4/col_0.png +0 -0
  26. wje/img_11.jpg_rows/row_5/col_0.png +0 -0
  27. wje/img_12.jpg_rows/row_0/col_0.png +0 -0
  28. wje/img_12.jpg_rows/row_0/col_1.png +0 -0
  29. wje/img_12.jpg_rows/row_1/col_0.png +0 -0
  30. wje/img_12.jpg_rows/row_1/col_1.png +0 -0
  31. wje/img_12.jpg_rows/row_2/col_0.png +0 -0
  32. wje/img_12.jpg_rows/row_2/col_1.png +0 -0
  33. wje/img_13.jpg_rows/row_0/col_0.png +0 -0
  34. wje/img_13.jpg_rows/row_0/col_1.png +0 -0
  35. wje/img_13.jpg_rows/row_1/col_0.png +0 -0
  36. wje/img_13.jpg_rows/row_1/col_1.png +0 -0
  37. wje/img_13.jpg_rows/row_2/col_0.png +0 -0
  38. wje/img_13.jpg_rows/row_3/col_0.png +0 -0
  39. wje/img_14.jpg_rows/row_0/col_0.png +0 -0
  40. wje/img_14.jpg_rows/row_0/col_1.png +0 -0
  41. wje/img_14.jpg_rows/row_1/col_0.png +0 -0
  42. wje/img_14.jpg_rows/row_1/col_1.png +0 -0
  43. wje/img_14.jpg_rows/row_2/col_0.png +0 -0
  44. wje/img_14.jpg_rows/row_3/col_0.png +0 -0
  45. wje/img_14.jpg_rows/row_4/col_0.png +0 -0
  46. wje/img_14.jpg_rows/row_4/col_1.png +0 -0
  47. wje/img_14.jpg_rows/row_5/col_0.png +0 -0
  48. wje/img_15.jpg_rows/row_0/col_0.png +0 -0
  49. wje/img_15.jpg_rows/row_0/col_1.png +0 -0
  50. wje/img_15.jpg_rows/row_1/col_0.png +0 -0
__pycache__/topic_extraction.cpython-310.pyc CHANGED
Binary files a/__pycache__/topic_extraction.cpython-310.pyc and b/__pycache__/topic_extraction.cpython-310.pyc differ
 
pearson_json/final_subtopics.json ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "title": "",
4
+ "contents": [],
5
+ "children": []
6
+ },
7
+ {
8
+ "title": "",
9
+ "contents": [],
10
+ "children": []
11
+ },
12
+ {
13
+ "title": "",
14
+ "contents": [],
15
+ "children": []
16
+ },
17
+ {
18
+ "title": "",
19
+ "contents": [],
20
+ "children": []
21
+ },
22
+ {
23
+ "title": "",
24
+ "contents": [],
25
+ "children": []
26
+ },
27
+ {
28
+ "title": "",
29
+ "contents": [],
30
+ "children": []
31
+ },
32
+ {
33
+ "title": "",
34
+ "contents": [],
35
+ "children": []
36
+ },
37
+ {
38
+ "title": "",
39
+ "contents": [],
40
+ "children": []
41
+ },
42
+ {
43
+ "title": "",
44
+ "contents": [],
45
+ "children": []
46
+ },
47
+ {
48
+ "title": "",
49
+ "contents": [],
50
+ "children": []
51
+ },
52
+ {
53
+ "title": "",
54
+ "contents": [],
55
+ "children": []
56
+ },
57
+ {
58
+ "title": "",
59
+ "contents": [],
60
+ "children": []
61
+ },
62
+ {
63
+ "title": "",
64
+ "contents": [],
65
+ "children": []
66
+ },
67
+ {
68
+ "title": "",
69
+ "contents": [],
70
+ "children": []
71
+ },
72
+ {
73
+ "title": "",
74
+ "contents": [],
75
+ "children": []
76
+ },
77
+ {
78
+ "title": "",
79
+ "contents": [],
80
+ "children": []
81
+ },
82
+ {
83
+ "title": "",
84
+ "contents": [],
85
+ "children": []
86
+ },
87
+ {
88
+ "title": "",
89
+ "contents": [],
90
+ "children": []
91
+ },
92
+ {
93
+ "title": "",
94
+ "contents": [],
95
+ "children": []
96
+ },
97
+ {
98
+ "title": "",
99
+ "contents": [],
100
+ "children": []
101
+ },
102
+ {
103
+ "title": "",
104
+ "contents": [],
105
+ "children": []
106
+ },
107
+ {
108
+ "title": "",
109
+ "contents": [],
110
+ "children": []
111
+ },
112
+ {
113
+ "title": "",
114
+ "contents": [],
115
+ "children": []
116
+ },
117
+ {
118
+ "title": "",
119
+ "contents": [],
120
+ "children": []
121
+ },
122
+ {
123
+ "title": "",
124
+ "contents": [],
125
+ "children": []
126
+ },
127
+ {
128
+ "title": "",
129
+ "contents": [],
130
+ "children": []
131
+ },
132
+ {
133
+ "title": "",
134
+ "contents": [],
135
+ "children": []
136
+ },
137
+ {
138
+ "title": "",
139
+ "contents": [],
140
+ "children": []
141
+ }
142
+ ]
topic_extr.py CHANGED
@@ -169,36 +169,149 @@ async def classify_image_async(image_data: bytes, api_key: str, max_retries: int
169
  return await loop.run_in_executor(None, call_gemini_for_table_classification, preprocessed, api_key, max_retries)
170
 
171
 
172
- def call_gemini_for_subtopic_identification(text: str, api_key: str, max_retries: int = 1) -> dict:
173
- """
174
- Sends the recognized text from a specification table to Gemini,
175
- asking it to identify the main topic (like '2 Algebra and functions')
176
- and subtopics (like '2.5', '3.4', etc.).
177
-
178
- Returns a dict of the form:
179
- {
180
- "title": "2 Algebra and functions",
181
- "subtopics": ["2.5", "2.6", ...]
182
- }
183
-
184
- If Gemini can't find anything, it might return empty strings or lists.
185
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  for attempt in range(max_retries + 1):
187
  try:
188
- prompt = f"""
189
- You are given text extracted from a table that represents topics and subtopics from an educational curriculum.
190
- The text may include a main topic heading in the format: "<number> <Topic Name>", for example, "2 Algebra and functions".
191
- It may also include subtopics in the format of "<number>.<number>", such as "2.5", "3.4", etc.
192
- Extract and output a valid JSON object with exactly two keys:
193
- - "title": the main topic heading (if found). If not found, use an empty string.
194
- - "subtopics": an array of strings representing each subtopic number extracted from the text.
195
- Output exactly in this JSON format with no additional text. For example:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  {
197
- "title": "2 Algebra and functions",
198
- "subtopics": ["2.5", "2.6"]
199
  }
200
- Text:
201
- {text}
202
  """
203
  global _GEMINI_CLIENT
204
  if _GEMINI_CLIENT is None:
@@ -207,36 +320,45 @@ Text:
207
 
208
  resp = client.models.generate_content(
209
  model="gemini-2.0-flash",
210
- contents=[prompt],
 
 
 
 
 
 
 
 
 
 
 
 
211
  config=types.GenerateContentConfig(temperature=0.0)
212
  )
213
-
 
 
214
  if not resp or not resp.text:
215
- # If Gemini gives no response, fallback to empty.
216
  return {"title": "", "subtopics": []}
217
 
218
  raw = resp.text.strip()
219
-
220
- # Attempt to parse raw as JSON
221
- try:
222
- data = json.loads(raw)
223
- # Guarantee the structure we want
224
- title = data.get("title", "")
225
- subs = data.get("subtopics", [])
226
- if not isinstance(subs, list):
227
- subs = []
228
- return {"title": title, "subtopics": subs}
229
- except Exception:
230
- # If JSON parse fails, return empty
231
- return {"title": "", "subtopics": []}
232
 
233
  except Exception as e:
234
- # If there's an error or a 503, we can retry or bail out
235
  if attempt < max_retries:
236
  time.sleep(0.5)
237
  else:
238
  return {"title": "", "subtopics": []}
239
 
 
 
240
 
241
 
242
  class S3ImageWriter(DataWriter):
@@ -314,11 +436,13 @@ class S3ImageWriter(DataWriter):
314
  logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
315
  continue
316
 
 
317
  with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
318
  temp_file.write(img_data)
319
  temp_path = temp_file.name
320
 
321
  try:
 
322
  if col_type.lower() == 'two':
323
  extractor = TableExtractor(
324
  skip_header=True,
@@ -334,42 +458,77 @@ class S3ImageWriter(DataWriter):
334
  subtopic_threshold=0.2
335
  )
336
  row_boxes = extractor.process_image(temp_path)
337
-
338
- snippet = ["**Extracted table cells:**"]
339
- cell_texts = []
340
  for i, row in enumerate(row_boxes):
341
- for j, box in enumerate(row):
342
- cell_key = f"{self.base_path}cells/table_s3_{os.path.basename(s3_key)}_r{i}_c{j}.jpg"
343
- self.s3_writer.write(cell_key, img_data) # or cell_data if you truly cropped
344
 
345
- text = "..." # placeholder
346
- cell_texts.append(text)
347
-
348
- snippet.append(f"![Row {i} Col {j}]({cell_key})")
349
 
350
- combined_text = "\n".join(cell_texts)
 
 
 
351
 
352
- subtopic_info = call_gemini_for_subtopic_identification(combined_text, self.gemini_api_key)
 
 
353
 
354
- # subtopic_info might be: {"title": "2 Algebra and functions", "subtopics": ["2.5"]}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
  final_json = {
356
- "title": subtopic_info.get("title", ""),
357
- "contents": [
358
- {"type": "image", "key": s3_key}
359
- ],
360
- "children": []
361
  }
362
- for st in subtopic_info.get("subtopics", []):
363
- final_json["children"].append({
364
- "title": st,
365
- "contents": [
366
- {"type": "image", "key": f"subtopic_{st}_example.jpg"}
367
- ]
368
- })
369
 
 
370
  self.extracted_subtopics[s3_key] = final_json
371
 
372
- # Replace the original table image line in the markdown with the snippet
 
 
 
 
373
  new_snip = "\n".join(snippet)
374
  old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
375
  md_content = md_content.replace(old_line, new_snip)
@@ -381,6 +540,8 @@ class S3ImageWriter(DataWriter):
381
 
382
  return md_content
383
 
 
 
384
  def post_process(self, key: str, md_content: str) -> str:
385
  return asyncio.run(self.post_process_async(key, md_content))
386
 
@@ -457,7 +618,7 @@ class LocalImageWriter(DataWriter):
457
  with open(temp_path, "wb") as f:
458
  f.write(desc_item["data"])
459
  try:
460
- if col_type.lower() == 'two':
461
  extractor = TableExtractor(
462
  skip_header=True,
463
  merge_two_col_rows=True,
@@ -822,7 +983,7 @@ class MineruNoTextProcessor:
822
 
823
  if __name__ == "__main__":
824
  input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
825
- output_dir = "/home/user/app/we"
826
  gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
827
  try:
828
  processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
@@ -830,4 +991,4 @@ if __name__ == "__main__":
830
  logger.info("Processing completed successfully.")
831
  # The result includes final_markdown and subtopics_extracted
832
  except Exception as e:
833
- logger.error(f"Processing failed: {e}")
 
169
  return await loop.run_in_executor(None, call_gemini_for_table_classification, preprocessed, api_key, max_retries)
170
 
171
 
172
+ # def call_gemini_for_subtopic_identification_image(image_data: bytes, api_key: str, max_retries: int = 1) -> dict:
173
+ # for attempt in range(max_retries + 1):
174
+ # try:
175
+ # prompt = """
176
+ # You are given an image from an educational curriculum specification. The image may contain either:
177
+ # 1) A main topic heading in the format: "<number> <Topic Name>", for example "2 Algebra and functions continued".
178
+ # 2) A subtopic heading in the format "<number>.<number>", for example "2.5", "2.6", or "3.4".
179
+ # 3) Possibly no relevant text at all.
180
+
181
+ # Your task:
182
+ # 1. If the cell shows a main topic, extract the topic name (e.g. "2 Algebra and functions") and place it in the JSON key "title".
183
+ # 2. If the cell shows one or more subtopic numbers (e.g. "2.5", "2.6"), collect them in the JSON key "subtopics" as an array of strings.
184
+ # 3. If neither a main topic nor subtopic is detected, return empty values.
185
+
186
+ # Output only valid JSON in this exact structure, with no extra text or explanation:
187
+
188
+ # {
189
+ # "title": "...",
190
+ # "subtopics": [...]
191
+ # }
192
+
193
+ # Where:
194
+ # - "title" is the recognized main topic (if any). Otherwise, an empty string.
195
+ # - "subtopics" is an array of recognized subtopic numbers (e.g. ["2.5", "2.6"]). Otherwise, an empty array.
196
+
197
+ # Examples:
198
+ # 1. If the image text is "2 Algebra and functions continued", return:
199
+ # {
200
+ # "title": "2 Algebra and functions continued",
201
+ # "subtopics": []
202
+ # }
203
+
204
+ # 2. If the image text is "2.5 Solve linear and quadratic inequalities ...", return:
205
+ # {
206
+ # "title": "",
207
+ # "subtopics": ["2.5"]
208
+ # }
209
+
210
+ # 3. If the image text is "2.6 Manipulate polynomials algebraically ...", return:
211
+ # {
212
+ # "title": "",
213
+ # "subtopics": ["2.6"]
214
+ # }
215
+
216
+ # If you cannot recognize any text matching these patterns, or if nothing is found, return:
217
+ # {
218
+ # "title": "",
219
+ # "subtopics": []
220
+ # }
221
+ # """
222
+ # global _GEMINI_CLIENT
223
+ # if _GEMINI_CLIENT is None:
224
+ # _GEMINI_CLIENT = genai.Client(api_key=api_key)
225
+ # client = _GEMINI_CLIENT
226
+
227
+ # resp = client.models.generate_content(
228
+ # model="gemini-2.0-flash",
229
+ # contents=[
230
+ # {
231
+ # "parts": [
232
+ # {"text": prompt},
233
+ # {
234
+ # "inline_data": {
235
+ # "mime_type": "image/jpeg",
236
+ # "data": base64.b64encode(image_data).decode("utf-8")
237
+ # }
238
+ # }
239
+ # ]
240
+ # }
241
+ # ],
242
+ # config=types.GenerateContentConfig(temperature=0.0)
243
+ # )
244
+ # if not resp or not resp.text:
245
+ # return {"title": "", "subtopics": []}
246
+
247
+ # raw = resp.text.strip()
248
+
249
+ # data = json.loads(raw)
250
+ # title = data.get("title", "")
251
+ # subtopics = data.get("subtopics", [])
252
+ # if not isinstance(subtopics, list):
253
+ # subtopics = []
254
+ # return {"title": title, "subtopics": subtopics}
255
+
256
+ # except Exception as e:
257
+ # if attempt < max_retries:
258
+ # time.sleep(0.5)
259
+ # else:
260
+ # return {"title": "", "subtopics": []}
261
+
262
+ # return {"title": "", "subtopics": []}
263
+
264
+ def call_gemini_for_subtopic_identification_image(image_data: bytes, api_key: str, max_retries: int = 1) -> dict:
265
  for attempt in range(max_retries + 1):
266
  try:
267
+ prompt = """
268
+ You are given an image from an educational curriculum specification. The image may contain either:
269
+ 1) A main topic heading in the format: "<number> <Topic Name>", for example "2 Algebra and functions continued".
270
+ 2) A subtopic heading in the format "<number>.<number>", for example "2.5", "2.6", or "3.4".
271
+ 3) Possibly no relevant text at all.
272
+
273
+ Your task:
274
+ 1. If the cell shows a main topic, extract the topic name (e.g. "2 Algebra and functions") and place it in the JSON key "title".
275
+ 2. If the cell shows one or more subtopic numbers (e.g. "2.5", "2.6"), collect them in the JSON key "subtopics" as an array of strings.
276
+ 3. If neither a main topic nor subtopic is detected, return empty values.
277
+
278
+ Output only valid JSON in this exact structure, with no extra text or explanation:
279
+
280
+ Output only valid JSON in this exact structure, with no extra text or explanation:
281
+
282
+ {
283
+ "title": "...",
284
+ "subtopics": [...]
285
+ }
286
+
287
+ Where:
288
+ - "title" is the recognized main topic (if any). Otherwise, an empty string.
289
+ - "subtopics" is an array of recognized subtopic numbers (e.g. ["2.5", "2.6"]). Otherwise, an empty array.
290
+
291
+ Examples:
292
+ 1. If the image text is "2 Algebra and functions continued", return:
293
+ {
294
+ "title": "2 Algebra and functions continued",
295
+ "subtopics": []
296
+ }
297
+
298
+ 2. If the image text is "2.5 Solve linear and quadratic inequalities ...", return:
299
+ {
300
+ "title": "",
301
+ "subtopics": ["2.5"]
302
+ }
303
+
304
+ 3. If the image text is "2.6 Manipulate polynomials algebraically ...", return:
305
+ {
306
+ "title": "",
307
+ "subtopics": ["2.6"]
308
+ }
309
+
310
+ If you cannot recognize any text matching these patterns, or if nothing is found, return:
311
  {
312
+ "title": "",
313
+ "subtopics": []
314
  }
 
 
315
  """
316
  global _GEMINI_CLIENT
317
  if _GEMINI_CLIENT is None:
 
320
 
321
  resp = client.models.generate_content(
322
  model="gemini-2.0-flash",
323
+ contents=[
324
+ {
325
+ "parts": [
326
+ {"text": prompt},
327
+ {
328
+ "inline_data": {
329
+ "mime_type": "image/jpeg",
330
+ "data": base64.b64encode(image_data).decode("utf-8")
331
+ }
332
+ }
333
+ ]
334
+ }
335
+ ],
336
  config=types.GenerateContentConfig(temperature=0.0)
337
  )
338
+ # Log the raw response
339
+ logger.info(f"Gemini subtopic extraction raw response: {resp.text if resp and resp.text else 'None'}")
340
+
341
  if not resp or not resp.text:
342
+ logger.warning("Gemini returned an empty response for subtopic extraction.")
343
  return {"title": "", "subtopics": []}
344
 
345
  raw = resp.text.strip()
346
+ data = json.loads(raw)
347
+ title = data.get("title", "")
348
+ subtopics = data.get("subtopics", [])
349
+ if not isinstance(subtopics, list):
350
+ subtopics = []
351
+ return {"title": title, "subtopics": subtopics}
 
 
 
 
 
 
 
352
 
353
  except Exception as e:
354
+ logger.error(f"Gemini subtopic identification error on attempt {attempt}: {e}")
355
  if attempt < max_retries:
356
  time.sleep(0.5)
357
  else:
358
  return {"title": "", "subtopics": []}
359
 
360
+ return {"title": "", "subtopics": []}
361
+
362
 
363
 
364
  class S3ImageWriter(DataWriter):
 
436
  logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
437
  continue
438
 
439
+ # Write temporary file for processing.
440
  with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
441
  temp_file.write(img_data)
442
  temp_path = temp_file.name
443
 
444
  try:
445
+ # 1) Extract row bounding boxes.
446
  if col_type.lower() == 'two':
447
  extractor = TableExtractor(
448
  skip_header=True,
 
458
  subtopic_threshold=0.2
459
  )
460
  row_boxes = extractor.process_image(temp_path)
461
+ logger.info(f"Extracted {len(row_boxes)} rows from {temp_path}")
 
 
462
  for i, row in enumerate(row_boxes):
463
+ logger.info(f"Row {i} has {len(row)} cells")
 
 
464
 
465
+ # out_folder = temp_path + "_rows"
466
+ # os.makedirs(out_folder, exist_ok=True)
467
+ out_folder = os.path.join(os.path.dirname(temp_path), os.path.basename(temp_path) + "_rows")
468
+ os.makedirs(out_folder, exist_ok=True)
469
 
470
+ extractor.save_extracted_cells(temp_path, row_boxes, out_folder)
471
+ logger.info(f"Files in {out_folder}:")
472
+ for root, dirs, files in os.walk(out_folder):
473
+ logger.info(f"{root}: {files}")
474
 
475
+ recognized_main_topic = ""
476
+ main_topic_image_key = None
477
+ recognized_subtopics = []
478
 
479
+ # 2) Loop over each cell image.
480
+ for i, row in enumerate(row_boxes):
481
+ row_dir = os.path.join(out_folder, f"row_{i}")
482
+ for j, _ in enumerate(row):
483
+ cell_path = os.path.join(row_dir, f"col_{j}.png")
484
+ if not os.path.isfile(cell_path):
485
+ alternative_path = os.path.join(row_dir, f"col_{j}.jpg")
486
+ if os.path.isfile(alternative_path):
487
+ cell_path = alternative_path
488
+ else:
489
+ logger.warning(f"Cell image not found: {cell_path}")
490
+ continue
491
+
492
+ with open(cell_path, "rb") as cf:
493
+ cell_image_data = cf.read()
494
+
495
+ # Save cell image to S3.
496
+ cell_key = f"{self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.png"
497
+ self.s3_writer.write(cell_key, cell_image_data)
498
+
499
+ # Log before calling Gemini.
500
+ logger.debug(f"About to call Gemini for cell image: {cell_path}")
501
+ info = call_gemini_for_subtopic_identification_image(cell_image_data, self.gemini_api_key)
502
+ logger.info(f"Gemini subtopic extraction result for cell {cell_path}: {info}")
503
+
504
+ if info["title"] and not recognized_main_topic:
505
+ recognized_main_topic = info["title"]
506
+ main_topic_image_key = cell_key
507
+
508
+ for st in info["subtopics"]:
509
+ recognized_subtopics.append({
510
+ "title": st,
511
+ "contents": [{"type": "image", "key": cell_key}],
512
+ "children": []
513
+ })
514
+
515
+ # 3) Build final JSON for this table.
516
  final_json = {
517
+ "title": recognized_main_topic,
518
+ "contents": [],
519
+ "children": recognized_subtopics
 
 
520
  }
521
+ if main_topic_image_key:
522
+ final_json["contents"].append({"type": "image", "key": main_topic_image_key})
 
 
 
 
 
523
 
524
+ # Save the final JSON.
525
  self.extracted_subtopics[s3_key] = final_json
526
 
527
+ # Optionally, create a snippet to replace the markdown line.
528
+ snippet = ["**Extracted table cells:**"]
529
+ for i, row in enumerate(row_boxes):
530
+ for j, _ in enumerate(row):
531
+ snippet.append(f"![Row {i} Col {j}]({self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.jpg)")
532
  new_snip = "\n".join(snippet)
533
  old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
534
  md_content = md_content.replace(old_line, new_snip)
 
540
 
541
  return md_content
542
 
543
+
544
+
545
  def post_process(self, key: str, md_content: str) -> str:
546
  return asyncio.run(self.post_process_async(key, md_content))
547
 
 
618
  with open(temp_path, "wb") as f:
619
  f.write(desc_item["data"])
620
  try:
621
+ if col_type.lower() == 'two': #check for table_row_extr script for more details
622
  extractor = TableExtractor(
623
  skip_header=True,
624
  merge_two_col_rows=True,
 
983
 
984
  if __name__ == "__main__":
985
  input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
986
+ output_dir = "/home/user/app/pearson_json"
987
  gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
988
  try:
989
  processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
 
991
  logger.info("Processing completed successfully.")
992
  # The result includes final_markdown and subtopics_extracted
993
  except Exception as e:
994
+ logger.error(f"Processing failed: {e}")
topic_extract_arsenii.py ADDED
@@ -0,0 +1,883 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import re
4
+ import gc
5
+ import json
6
+ import logging
7
+ import fitz
8
+ import boto3
9
+ import base64
10
+ import time
11
+ import asyncio
12
+ import tempfile
13
+ import requests
14
+ from io import BytesIO
15
+ from typing import List, Dict, Any
16
+
17
+ import torch
18
+ import cv2
19
+ import numpy as np
20
+
21
+ from google import genai
22
+ from google.genai import types
23
+
24
+ from magic_pdf.data.dataset import PymuDocDataset
25
+ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
26
+ from magic_pdf.data.data_reader_writer.base import DataWriter
27
+ from table_row_extraction import TableExtractor
28
+
29
+ logging.basicConfig(level=logging.INFO)
30
+ logger = logging.getLogger(__name__)
31
+ logger.setLevel(logging.INFO)
32
+ file_handler = logging.FileHandler("topic_extraction_ars.log")
33
+ file_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s - %(message)s"))
34
+ logger.addHandler(file_handler)
35
+
36
+ _GEMINI_CLIENT = None
37
+
38
+ def unify_whitespace(text: str) -> str:
39
+ return re.sub(r"\s+", " ", text).strip()
40
+
41
+ def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
42
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
43
+ st_norm = unify_whitespace(search_text)
44
+ found = []
45
+ for i in range(doc.page_count):
46
+ raw = doc[i].get_text("raw")
47
+ norm = unify_whitespace(raw)
48
+ if st_norm in norm:
49
+ found.append(i)
50
+ doc.close()
51
+ return sorted(found)
52
+
53
+ def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
54
+ if not page_indices:
55
+ raise ValueError("No page indices provided for subset creation.")
56
+ doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
57
+ new_doc = fitz.open()
58
+ for p in sorted(set(page_indices)):
59
+ if 0 <= p < doc.page_count:
60
+ new_doc.insert_pdf(doc, from_page=p, to_page=p)
61
+ else:
62
+ logger.error(f"Page index {p} out of range (0..{doc.page_count - 1}).")
63
+ raise ValueError(f"Page index {p} out of range.")
64
+ subset_bytes = new_doc.tobytes()
65
+ new_doc.close()
66
+ doc.close()
67
+ return subset_bytes
68
+
69
+ class s3Writer:
70
+ def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
71
+ self.bucket = bucket
72
+ self.client = boto3.client(
73
+ 's3',
74
+ aws_access_key_id=ak,
75
+ aws_secret_access_key=sk,
76
+ endpoint_url=endpoint_url
77
+ )
78
+
79
+ def write(self, path: str, data: bytes) -> None:
80
+ try:
81
+ file_obj = BytesIO(data)
82
+ self.client.upload_fileobj(
83
+ file_obj,
84
+ self.bucket,
85
+ path
86
+ )
87
+ logger.info(f"Uploaded to S3: {path}")
88
+ except Exception as e:
89
+ logger.error(f"Failed to upload to S3: {str(e)}")
90
+ raise
91
+
92
+ def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
93
+ arr = np.frombuffer(image_data, np.uint8)
94
+ img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
95
+ if img is not None:
96
+ h, w, _ = img.shape
97
+ if max(h, w) > max_dim:
98
+ scale = max_dim / float(max(h, w))
99
+ new_w = int(w * scale)
100
+ new_h = int(h * scale)
101
+ img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
102
+ encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
103
+ success, enc = cv2.imencode(".jpg", img, encode_params)
104
+ if success:
105
+ return enc.tobytes()
106
+ return image_data
107
+
108
+ def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
109
+ """
110
+ Existing Gemini call to classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE.
111
+ """
112
+ for attempt in range(max_retries + 1):
113
+ try:
114
+ prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
115
+ The three-column 'table' image includes such key features:
116
+ - Three columns header
117
+ - Headers like 'Topics', 'Content', 'Guidelines'
118
+ - Possibly sections (e.g. 8.4, 9.1)
119
+ The two-column 'table' image includes such key features:
120
+ - Two columns
121
+ - Headers like 'Subject content' and 'Additional information'
122
+ - Possibly sections (e.g. 2.1, 3.4)
123
+ If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
124
+ If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
125
+ If the image does not show a table at all, respond with 'NO_TABLE'.
126
+ Return only one of these exact labels.
127
+ """
128
+ global _GEMINI_CLIENT
129
+ if _GEMINI_CLIENT is None:
130
+ _GEMINI_CLIENT = genai.Client(api_key=api_key)
131
+ client = _GEMINI_CLIENT
132
+
133
+ resp = client.models.generate_content(
134
+ model="gemini-2.0-flash",
135
+ contents=[
136
+ {
137
+ "parts": [
138
+ {"text": prompt},
139
+ {
140
+ "inline_data": {
141
+ "mime_type": "image/jpeg",
142
+ "data": base64.b64encode(image_data).decode('utf-8')
143
+ }
144
+ }
145
+ ]
146
+ }
147
+ ],
148
+ config=types.GenerateContentConfig(temperature=0.0)
149
+ )
150
+ if resp and resp.text:
151
+ classification = resp.text.strip().upper()
152
+ if "THREE" in classification:
153
+ return "THREE_COLUMN"
154
+ elif "TWO" in classification:
155
+ return "TWO_COLUMN"
156
+ return "NO_TABLE"
157
+ except Exception as e:
158
+ logger.error(f"Gemini table classification error: {e}")
159
+ if "503" in str(e):
160
+ return "NO_TABLE"
161
+ if attempt < max_retries:
162
+ time.sleep(0.5)
163
+ else:
164
+ return "NO_TABLE"
165
+
166
+ async def classify_image_async(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
167
+ loop = asyncio.get_event_loop()
168
+ preprocessed = preprocess_image(image_data)
169
+ return await loop.run_in_executor(None, call_gemini_for_table_classification, preprocessed, api_key, max_retries)
170
+
171
+
172
def call_gemini_for_subtopic_identification_image(image_data: bytes, api_key: str, max_retries: int = 1) -> dict:
    """Ask Gemini to read a table-cell image and extract topic headings.

    The model is asked to look for:
      - a main topic heading in the format "<number> <Topic Name>",
        e.g. "2 Algebra and functions"
      - subtopic headings in the format "<number>.<number>", e.g. "2.5", "3.4"

    Args:
        image_data: Raw bytes of the cell image (JPEG or PNG).
        api_key: Gemini API key, used only if the shared client is not yet created.
        max_retries: Number of additional attempts after the first failure.

    Returns:
        A dict of the form ``{"title": str, "subtopics": list}``. Both values
        are empty when every attempt fails or the reply cannot be parsed.
    """
    global _GEMINI_CLIENT

    # This is a plain string (not an f-string), so the JSON example must use
    # single braces; doubled braces would be shown to the model verbatim.
    prompt = """
You are given an image of a table cell from an educational curriculum specification.
The text in this cell may contain:
1) A main topic heading in the format "<number> <Topic Name>", for example: "2 Algebra and functions"
2) A subtopic heading in the format "<number>.<number>", for example: "2.5" or "3.4"
Identify if the cell contains exactly one main topic or subtopic.
Return a valid JSON object with the keys "title" and "subtopics" of the form:
{
"title": "2 Algebra and functions",
"subtopics": ["2.5", "2.6"]
}
If you find a main topic (like '2 Algebra and functions'), put it in "title".
If you find subtopic numbers (like '2.5', '3.4'), put them in the "subtopics" array.
"""

    # Declare the real container format: PNG files start with a fixed 8-byte
    # signature; anything else is sent as JPEG, as before.
    if image_data.startswith(b"\x89PNG\r\n\x1a\n"):
        mime_type = "image/png"
    else:
        mime_type = "image/jpeg"

    for attempt in range(max_retries + 1):
        try:
            # Re-use or lazily create the module-wide Gemini client.
            if _GEMINI_CLIENT is None:
                _GEMINI_CLIENT = genai.Client(api_key=api_key)
            client = _GEMINI_CLIENT

            resp = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[
                    {
                        "parts": [
                            {"text": prompt},
                            {
                                "inline_data": {
                                    "mime_type": mime_type,
                                    "data": base64.b64encode(image_data).decode("utf-8"),
                                }
                            },
                        ]
                    }
                ],
                config=types.GenerateContentConfig(temperature=0.0),
            )

            text = resp.text if resp is not None else None
            if not text:
                raise ValueError("empty response from Gemini")

            # Strip Markdown code fences the model sometimes wraps JSON in.
            raw = text.strip().replace("```json", "").replace("```", "")
            logger.info(f"== RAW == {raw}")

            data = json.loads(raw)
            if not isinstance(data, dict):
                raise ValueError(f"expected a JSON object, got {type(data).__name__}")

            title = data.get("title") or ""  # JSON null becomes an empty title
            subtopics = data.get("subtopics", [])
            if not isinstance(subtopics, list):
                subtopics = []
            return {"title": title, "subtopics": subtopics}

        except Exception as e:
            # Best effort: report the failure, then retry or give up quietly.
            logger.warning(
                f"Subtopic identification attempt {attempt + 1}/{max_retries + 1} failed: {e}"
            )
            if attempt < max_retries:
                time.sleep(0.5)

    # Reached when every attempt failed (or max_retries was negative).
    return {"title": "", "subtopics": []}
248
+
249
+
250
+
251
+
252
class S3ImageWriter(DataWriter):
    """DataWriter that uploads page images to S3 and post-processes table images.

    Every image handed to :meth:`write` is uploaded under ``base_path`` and
    remembered in ``self.descriptions``. :meth:`post_process` then classifies
    the images, splits table images into cells, asks Gemini for the topic and
    subtopic headings found in each cell, and rewrites the markdown.
    """

    def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
        """Store the S3 client, the key prefix (normalised to end in "/") and the API key."""
        self.s3_writer = s3_writer
        self.base_path = base_path if base_path.endswith("/") else base_path + "/"
        self.gemini_api_key = gemini_api_key
        # original markdown path -> {"data", "s3_path", "table_classification", "final_alt"}
        self.descriptions = {}
        self._img_count = 0
        self.extracted_tables = {}
        # table image S3 key -> JSON-ready dict with title / contents / children
        self.extracted_subtopics = {}

    def write(self, path: str, data: bytes) -> None:
        """Upload one image as ``img_<n>.jpg`` and remember it under its original path."""
        self._img_count += 1
        unique_id = f"img_{self._img_count}.jpg"
        s3_key = f"{self.base_path}{unique_id}"
        self.s3_writer.write(s3_key, data)
        self.descriptions[path] = {
            "data": data,
            "s3_path": s3_key,
            "table_classification": "NO_TABLE",
            "final_alt": ""
        }

    async def post_process_async(self, key: str, md_content: str) -> str:
        """Classify images, process table images and return image-only markdown.

        Args:
            key: Prefix that precedes each image path in ``md_content``.
            md_content: Markdown whose image references look like ``![](<key><path>)``.

        Returns:
            The markdown lines that are image references, joined by newlines.
        """
        logger.info("Classifying images to detect tables.")
        tasks = {
            p: asyncio.create_task(classify_image_async(info["data"], self.gemini_api_key))
            for p, info in self.descriptions.items()
        }
        results = await asyncio.gather(*tasks.values(), return_exceptions=True)
        for p, result in zip(tasks.keys(), results):
            # BaseException also covers CancelledError, which gather() can return.
            if isinstance(result, BaseException):
                logger.error(f"Table classification error for {p}: {result}")
                self.descriptions[p]['table_classification'] = "NO_TABLE"
            else:
                self.descriptions[p]['table_classification'] = result

        # Replace the original markdown references with alt text + S3 key.
        for p, info in self.descriptions.items():
            cls = info['table_classification']
            if cls == "TWO_COLUMN":
                info['final_alt'] = "HAS TO BE PROCESSED - two column table"
            elif cls == "THREE_COLUMN":
                info['final_alt'] = "HAS TO BE PROCESSED - three column table"
            else:
                info['final_alt'] = "NO_TABLE image"
            md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['s3_path']})")

        md_content = await self._process_table_images_in_markdown(key, md_content)

        # Keep only the lines that are image references.
        final_lines = [
            line.strip() for line in md_content.split("\n")
            if re.match(r"^\!\[.*\]\(.*\)", line.strip())
        ]
        return "\n".join(final_lines)

    @staticmethod
    def _find_cell_file(row_dir: str, col_index: int):
        """Return the path of the saved cell image for a column, or None.

        The extension written by ``save_extracted_cells`` is not visible from
        this class, so the common image extensions are probed in turn.
        """
        for ext in (".jpg", ".jpeg", ".png"):
            candidate = os.path.join(row_dir, f"col_{col_index}{ext}")
            if os.path.isfile(candidate):
                return candidate
        return None

    async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
        """Split every flagged table image into cells and record its subtopics.

        For each ``HAS TO BE PROCESSED`` image reference the table is cut into
        cells, each cell is uploaded to S3 and sent to Gemini, the recognised
        headings are stored in ``self.extracted_subtopics`` and the reference
        is replaced by one image link per cell.

        Args:
            key: Markdown image prefix (unused here, kept for interface stability).
            md_content: Markdown produced by :meth:`post_process_async`.

        Returns:
            The markdown with table references replaced by per-cell references.
        """
        import shutil  # local import: the file's import block is outside this view

        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
        if not matches:
            return md_content

        for (col_type, s3_key) in matches:
            logger.info(f"Processing table image: {s3_key}, columns={col_type}")
            img_data = next(
                (d.get("data") for d in self.descriptions.values() if d.get("s3_path") == s3_key),
                None,
            )
            if img_data is None:
                logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
                continue

            # The extractor works on file paths, so spill the image to disk.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
                temp_file.write(img_data)
                temp_path = temp_file.name
            out_folder = temp_path + "_rows"

            try:
                is_two_col = col_type.lower() == 'two'
                extractor = TableExtractor(
                    skip_header=True,
                    merge_two_col_rows=is_two_col,
                    enable_subtopic_merge=is_two_col,
                    subtopic_threshold=0.2
                )
                row_boxes = extractor.process_image(temp_path)

                os.makedirs(out_folder, exist_ok=True)
                extractor.save_extracted_cells(temp_path, row_boxes, out_folder)

                recognized_main_topic = None
                recognized_subtopics = []
                snippet = ["**Extracted table cells:**"]
                table_name = os.path.basename(s3_key)

                for i, row in enumerate(row_boxes):
                    row_dir = os.path.join(out_folder, f"row_{i}")
                    for j, _ in enumerate(row):
                        cell_key = f"{self.base_path}cells/table_s3_{table_name}_r{i}_c{j}.jpg"
                        snippet.append(f"![Row {i} Col {j}]({cell_key})")

                        cell_path = self._find_cell_file(row_dir, j)
                        if cell_path is None:
                            # Keep the markdown link resolvable, as before, by
                            # storing the full table image under the cell key.
                            logger.warning(
                                f"No cell image found for row {i} col {j} of {s3_key}; "
                                "uploading the full table image instead."
                            )
                            self.s3_writer.write(cell_key, img_data)
                            continue

                        with open(cell_path, "rb") as cf:
                            cell_image_data = cf.read()

                        # Upload the cropped cell once, under the key the markdown uses.
                        self.s3_writer.write(cell_key, cell_image_data)

                        # e.g. info = {"title": "2 Algebra and functions", "subtopics": ["2.5"]}
                        info = call_gemini_for_subtopic_identification_image(cell_image_data, self.gemini_api_key)
                        logger.info(f"== INFO == {info}")

                        # The last non-empty title wins; subtopics accumulate.
                        if info["title"]:
                            recognized_main_topic = info["title"]
                        if info["subtopics"]:
                            recognized_subtopics.extend(info["subtopics"])

                final_json = {
                    "title": recognized_main_topic,
                    "contents": [
                        {
                            "type": "image",
                            "key": s3_key
                        }
                    ],
                    "children": []
                }
                for st in recognized_subtopics:
                    final_json["children"].append({
                        "title": st,
                        "contents": [
                            {"type": "image", "key": f"subtopic_{st}_example.jpg"}
                        ]
                    })

                self.extracted_subtopics[s3_key] = final_json

                # Replace the original table image line with the per-cell snippet.
                new_snip = "\n".join(snippet)
                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
                md_content = md_content.replace(old_line, new_snip)

            except Exception as e:
                logger.error(f"Error processing table image {s3_key}: {e}")
            finally:
                os.remove(temp_path)
                # The cell crops live next to the temp file; remove them as well.
                shutil.rmtree(out_folder, ignore_errors=True)

        return md_content

    def post_process(self, key: str, md_content: str) -> str:
        """Synchronous wrapper that runs :meth:`post_process_async` in a new event loop."""
        return asyncio.run(self.post_process_async(key, md_content))
436
+
437
+
438
class LocalImageWriter(DataWriter):
    """DataWriter that stores images in a local folder (used for testing).

    Mirrors ``S3ImageWriter``: images are saved as ``img_<n>.jpg`` in
    ``output_folder``, classified, and table images are split into cells
    whose relative paths replace the table reference in the markdown.
    """

    def __init__(self, output_folder: str, gemini_api_key: str):
        """Create the output folder if needed and initialise bookkeeping."""
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)
        # original markdown path -> {"data", "relative_path", "table_classification", "final_alt"}
        self.descriptions = {}
        self._img_count = 0
        self.gemini_api_key = gemini_api_key

        # table image id -> list of extracted cell paths relative to output_folder
        self.extracted_tables = {}
        # Present for parity with S3ImageWriter, whose users read this attribute;
        # this writer never fills it.
        self.extracted_subtopics = {}

    def write(self, path: str, data: bytes) -> None:
        """Record one image and save it locally as ``img_<n>.jpg``."""
        self._img_count += 1
        unique_id = f"img_{self._img_count}.jpg"
        self.descriptions[path] = {
            "data": data,
            "relative_path": unique_id,
            "table_classification": "NO_TABLE",
            "final_alt": ""
        }
        # Also save the original image locally for testing.
        image_path = os.path.join(self.output_folder, unique_id)
        with open(image_path, "wb") as f:
            f.write(data)

    async def post_process_async(self, key: str, md_content: str) -> str:
        """Classify images, process table images and return image-only markdown.

        Args:
            key: Prefix that precedes each image path in ``md_content``.
            md_content: Markdown whose image references look like ``![](<key><path>)``.

        Returns:
            The markdown lines that are image references, joined by newlines.
        """
        logger.info("Classifying images to detect tables.")
        paths = list(self.descriptions)
        # Run all classifications concurrently instead of awaiting them one by one.
        results = await asyncio.gather(
            *(classify_image_async(self.descriptions[p]["data"], self.gemini_api_key) for p in paths),
            return_exceptions=True,
        )
        for p, result in zip(paths, results):
            if isinstance(result, BaseException):
                logger.error(f"Table classification error: {result}")
                self.descriptions[p]['table_classification'] = "NO_TABLE"
            else:
                self.descriptions[p]['table_classification'] = result

        for p, info in self.descriptions.items():
            cls = info['table_classification']
            if cls == "TWO_COLUMN":
                info['final_alt'] = "HAS TO BE PROCESSED - two column table"
            elif cls == "THREE_COLUMN":
                info['final_alt'] = "HAS TO BE PROCESSED - three column table"
            else:
                info['final_alt'] = "NO_TABLE image"
            md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['relative_path']})")

        md_content = self._process_table_images_in_markdown(md_content)

        # Keep only the lines that are image references.
        final_lines = [
            line.strip() for line in md_content.split("\n")
            if re.match(r"^\!\[.*\]\(.*\)", line.strip())
        ]
        return "\n".join(final_lines)

    @staticmethod
    def _find_cell_file(row_dir: str, col_index: int) -> str:
        """Return the saved cell image path for a column.

        The extension written by ``save_extracted_cells`` is not visible from
        this class, so common image extensions are probed; when nothing exists
        the historical ``col_<n>.jpg`` path is returned unchanged.
        """
        for ext in (".jpg", ".jpeg", ".png"):
            candidate = os.path.join(row_dir, f"col_{col_index}{ext}")
            if os.path.isfile(candidate):
                return candidate
        return os.path.join(row_dir, f"col_{col_index}.jpg")

    def _process_table_images_in_markdown(self, md_content: str) -> str:
        """Split every flagged table image into cells and link them in the markdown.

        Args:
            md_content: Markdown produced by :meth:`post_process_async`.

        Returns:
            The markdown with each table reference replaced by one image
            reference per extracted cell.
        """
        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
        if not matches:
            return md_content

        for (col_type, image_id) in matches:
            logger.info(f"Processing table image => {image_id}, columns={col_type}")
            temp_path = os.path.join(self.output_folder, image_id)
            desc_item = next(
                (val for val in self.descriptions.values() if val["relative_path"] == image_id),
                None,
            )
            if not desc_item:
                logger.warning(f"No matching image data for {image_id}, skipping extraction.")
                continue
            if not os.path.exists(temp_path):
                with open(temp_path, "wb") as f:
                    f.write(desc_item["data"])
            try:
                # See the table_row_extr script for details on these switches.
                is_two_col = col_type.lower() == 'two'
                extractor = TableExtractor(
                    skip_header=True,
                    merge_two_col_rows=is_two_col,
                    enable_subtopic_merge=is_two_col,
                    subtopic_threshold=0.2
                )
                row_boxes = extractor.process_image(temp_path)
                out_folder = temp_path + "_rows"
                os.makedirs(out_folder, exist_ok=True)
                extractor.save_extracted_cells(temp_path, row_boxes, out_folder)

                # List all extracted cell images relative to the output folder.
                extracted_cells = [
                    os.path.relpath(os.path.join(root, file), self.output_folder)
                    for root, _dirs, files in os.walk(out_folder)
                    for file in files
                ]
                # Save mapping for testing.
                self.extracted_tables[image_id] = extracted_cells

                snippet = ["**Extracted table cells:**"]
                for i, row in enumerate(row_boxes):
                    row_dir = os.path.join(out_folder, f"row_{i}")
                    for j, _ in enumerate(row):
                        cell_path = self._find_cell_file(row_dir, j)
                        relp = os.path.relpath(cell_path, self.output_folder)
                        snippet.append(f"![Row {i} Col {j}]({relp})")
                new_snip = "\n".join(snippet)
                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_id})"
                md_content = md_content.replace(old_line, new_snip)
            except Exception as e:
                logger.error(f"Error processing table image {image_id}: {e}")
            finally:
                # The source table image is removed once processed; the
                # "<image>_rows" folder with the cell crops is kept.
                if os.path.exists(temp_path):
                    os.remove(temp_path)
        return md_content

    def post_process(self, key: str, md_content: str) -> str:
        """Synchronous wrapper that runs :meth:`post_process_async` in a new event loop."""
        return asyncio.run(self.post_process_async(key, md_content))
556
+
557
class GeminiTopicExtractor:
    """Find the major subtopics of a PDF specification and their page ranges.

    The text of the first ``num_pages`` pages (which should contain the table
    of contents) is sent to Gemini together with few-shot examples.
    """

    def __init__(self, api_key: str = None, num_pages: int = 14):
        """Store the API key (falling back to ``GEMINI_API_KEY``) and the page budget."""
        self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
        self.num_pages = num_pages

    @staticmethod
    def _collect_page_ranges(mapping: dict) -> Dict[str, List[int]]:
        """Keep only the entries whose value is a two-integer ``[start, end]`` list."""
        return {
            name: rng
            for name, rng in mapping.items()
            if isinstance(rng, list)
            and len(rng) == 2
            # bool is an int subclass; reject it so True/False never act as pages
            and all(isinstance(v, int) and not isinstance(v, bool) for v in rng)
        }

    def extract_subtopics(self, pdf_path: str) -> Dict[str, List[int]]:
        """Return ``{subtopic name: [start_page, end_page]}`` as read from the contents page.

        Args:
            pdf_path: Local path or http(s) URL of the PDF.

        Returns:
            A mapping of subtopic names to 1-based page ranges, or an empty
            dict when no text could be read, the model gave no usable answer,
            or the request failed.
        """
        global _GEMINI_CLIENT

        first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
        if not first_pages_text.strip():
            logger.error("No text from first pages => cannot extract subtopics.")
            return {}
        # NOTE: example 1 ends "Paper 3" at 39 because the next contents entry
        # ("Assessment Objectives") starts on page 40, matching the rule below.
        prompt = f"""
You have the first pages of a PDF specification, including a table of contents.
Instructions:
1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
4. Output only valid JSON of the form:
{{
"Subtopic A": [start_page, end_page],
"Subtopic B": [start_page, end_page]
}}
5. If you can't find any subtopics, return an empty JSON.
Important notes:
- The correct "end_page" must be the page number of the next topic or subtopic minus 1.
- The final output must be valid JSON only, with no extra text or code blocks.
Examples:
1. Given this table of contents:
1 Introduction – 2
Why choose Edexcel A Level Mathematics? - 2
Supporting you in planning and implementing this qualification - 3
Qualification at a glance - 5
2 Subject content and assessment information – 7
Paper 1 and Paper 2: Pure Mathematics - 11
Paper 3: Statistics and Mechanics - 30
Assessment Objectives - 40
3 Administration and general information – 42
Entries - 42
Access arrangements, reasonable adjustments, special consideration and malpractice - 42
Student recruitment and progression - 45
Appendix 1: Formulae – 49
Appendix 2: Notation – 53
Appendix 3: Use of calculators – 59
Appendix 4: Assessment Objectives – 60
Appendix 5: The context for the development of this qualification – 62
Appendix 6: Transferable skills – 64
Appendix 7: Level 3 Extended Project qualification – 65
Appendix 8: Codes – 67
The correct output should be:
{{
"Paper 1 and Paper 2: Pure Mathematics": [11, 29],
"Paper 3: Statistics and Mechanics": [30, 39]
}}
2. Given this table of contents:
Qualification at a glance – 1
Assessment Objectives and weightings - 4
Knowledge, skills and understanding – 5
Theme 1: Introduction to markets and market failure - 5
Theme 2: The UK economy – performance and policies - 11
Theme 3: Business behaviour and the labour market - 21
Theme 4: A global perspective - 29
Assessment – 39
Assessment summary - 39
Assessment objectives - 41
Assessment overview - 42
Breakdown of assessment objectives - 42
Synoptic assessment - 43
Discount code and performance tables - 43
Access arrangements, reasonable adjustments and special consideration - 44
Malpractice - 45
Equality Act 2010 and Pearson equality policy - 45
Synoptic assessment - 46
Awarding and reporting - 47
Other information – 49
Student recruitment -49
Prior learning and other requirements -49
Progression - 49
Appendix 1: Transferable skills – 53
Appendix 2: Level 3 Extended Project qualification – 55
Appendix 3: Quantitative skills – 59
Appendix 4: Codes – 61
Appendix 5: Index – 63
The correct output should be:
{{
"Theme 1: Introduction to markets and market failure": [5, 10],
"Theme 2: The UK economy – performance and policies": [11, 20],
"Theme 3: Business behaviour and the labour market": [21, 28],
"Theme 4: A global perspective": [29, 38]
}}
3. You might also see sections like:
2.1 AS Unit 1 11
2.2 AS Unit 2 18
2.3 A2 Unit 3 24
2.4 A2 Unit 4 31
In that scenario, your output might look like:
{{
"2.1 AS Unit 1": [11, 17],
"2.2 AS Unit 2": [18, 23],
"2.3 A2 Unit 3": [24, 30],
"2.4 A2 Unit 4": [31, 35]
}}
4. Another example might list subtopics:
3.1 Overarching themes 11
3.2 A: Proof 12
3.3 B: Algebra and functions 13
3.4 C: Coordinate geometry in the ( x , y ) plane 14
3.5 D: Sequences and series 15
3.6 E: Trigonometry 16
3.7 F: Exponentials and logarithms 17
3.8 G: Differentiation 18
3.9 H: Integration 19
3.10 I: Numerical methods 20
3.11 J: Vectors 20
3.12 K: Statistical sampling 21
3.13 L: Data presentation and interpretation 21
3.14 M: Probability 22
3.15 N: Statistical distributions 23
3.16 O: Statistical hypothesis testing 23
3.17 P: Quantities and units in mechanics 24
3.18 Q: Kinematics 24
3.19 R: Forces and Newton’s laws 24
3.20 S: Moments 25
3.21 Use of data in statistics 26
Here the correct output might look like:
{{
"A: Proof": [12, 12],
"B: Algebra and functions": [13, 13],
...
}}
Now, extract topics from this text:
{first_pages_text}
"""
        # Re-use or lazily create the module-wide Gemini client.
        if _GEMINI_CLIENT is None:
            _GEMINI_CLIENT = genai.Client(api_key=self.api_key)
        client = _GEMINI_CLIENT
        try:
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[prompt],
                config=types.GenerateContentConfig(temperature=0.0)
            )
            if not response or not response.text:
                logger.warning("No text from LLM => returning empty subtopics.")
                return {}
            # Strip Markdown code fences the model sometimes adds despite the prompt.
            cleaned = response.text.strip().replace("```json", "").replace("```", "")
            try:
                data = json.loads(cleaned)
            except Exception as json_err:
                logger.error(f"JSON parsing error: {json_err}")
                return {}
            if not isinstance(data, dict):
                logger.error(f"Gemini subtopic extraction error: expected a JSON object, got {type(data).__name__}")
                return {}
            # If the model nested the answer one level down, use the first nested
            # object; otherwise the top-level object is the answer.
            nested = next((v for v in data.values() if isinstance(v, dict)), None)
            return self._collect_page_ranges(nested if nested is not None else data)
        except Exception as e:
            logger.error(f"Gemini subtopic extraction error: {e}")
            return {}

    def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
        """Return the plain text of the first ``num_pages`` pages, joined by newlines.

        Args:
            pdf_path: Local path or http(s) URL of the PDF.
            num_pages: Maximum number of leading pages to read.

        Returns:
            The extracted text; empty (or partial) if the PDF could not be
            downloaded, opened or read. Errors are logged, not raised.
        """
        text_parts = []
        try:
            if pdf_path.startswith(("http://", "https://")):
                # A timeout keeps a stalled server from hanging the pipeline.
                response = requests.get(pdf_path, timeout=60)
                if response.status_code != 200:
                    logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
                    return ""
                pdf_bytes = response.content
            else:
                with open(pdf_path, "rb") as f:
                    pdf_bytes = f.read()
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            try:
                for i in range(min(num_pages, doc.page_count)):
                    # "raw" is not a PyMuPDF get_text option and was silently
                    # treated as "text"; request that format explicitly.
                    text_parts.append(doc[i].get_text("text"))
            finally:
                doc.close()
        except Exception as e:
            logger.error(f"Could not open PDF: {e}")
        return "\n".join(text_parts)
748
+
749
+
750
class MineruNoTextProcessor:
    """Run the topic-extraction pipeline on one PDF.

    Steps: ask Gemini for the subtopic page ranges, map them to physical page
    indices, analyse only those pages, upload images to S3 and write the
    subtopics found in table images to ``final_subtopics.json``.
    """

    def __init__(self, output_folder: str, gemini_api_key: str):
        """Create the output folder and configure the analysis and S3 settings.

        Args:
            output_folder: Directory that receives ``final_subtopics.json``.
            gemini_api_key: Gemini API key; falls back to ``GEMINI_API_KEY``.
        """
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)
        self.layout_model = "doclayout_yolo"
        self.formula_enable = True
        self.table_enable = False
        self.language = "en"

        self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=20)
        self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")

        self.use_s3 = True
        self.s3_writer = s3Writer(
            ak=os.getenv("S3_ACCESS_KEY"),
            sk=os.getenv("S3_SECRET_KEY"),
            bucket="quextro-resources",
            endpoint_url=os.getenv("S3_ENDPOINT")
        )

    def cleanup_gpu(self):
        """Run garbage collection and empty the CUDA cache; failures are only logged."""
        try:
            gc.collect()
            torch.cuda.empty_cache()
            logger.info("GPU memory cleaned up.")
        except Exception as e:
            logger.error(f"Error during GPU cleanup: {e}")

    def _load_pdf_bytes(self, pdf_path: str) -> bytes:
        """Return the PDF content from a local path or an http(s) URL."""
        if pdf_path.startswith(("http://", "https://")):
            # A timeout keeps a stalled server from hanging the pipeline.
            response = requests.get(pdf_path, timeout=60)
            if response.status_code != 200:
                logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
                raise Exception(f"Failed to download PDF: {pdf_path}")
            pdf_bytes = response.content
            logger.info("Downloaded %d bytes for pdf_url='%s'", len(pdf_bytes), pdf_path)
        else:
            with open(pdf_path, "rb") as f:
                pdf_bytes = f.read()
            logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
        return pdf_bytes

    def _select_pages(self, pdf_bytes: bytes, subtopics: dict, total_pages: int) -> set:
        """Translate printed page ranges into 0-based page indices of the PDF.

        The offset between printed and physical page numbers is estimated from
        where each subtopic name occurs in the document. All pages are returned
        when there are no subtopics or no usable range.
        """
        from statistics import StatisticsError, median, mode

        every_page = set(range(total_pages))
        if not subtopics:
            return every_page

        # Validate once so both loops below see the same well-formed entries.
        valid = {
            name: rng for name, rng in subtopics.items()
            if isinstance(rng, list) and len(rng) == 2
        }

        offset_candidates = []
        for subname, (start_p, _end_p) in valid.items():
            for p in find_all_occurrences(pdf_bytes, subname):
                candidate = p - (start_p - 1)
                if candidate > 0:
                    offset_candidates.append(candidate)
        if offset_candidates:
            try:
                global_offset = mode(offset_candidates)
            except StatisticsError:
                # Python < 3.8 raises when there is no unique mode.
                global_offset = int(median(offset_candidates))
        else:
            global_offset = 0
        logger.info(f"Computed global offset: {global_offset}")

        pages = set()
        for start_p, end_p in valid.values():
            if start_p > end_p:
                continue
            first = (start_p - 1) + global_offset
            last = (end_p - 1) + global_offset
            # Drop indices outside the document so building the subset cannot
            # be asked for a page that does not exist.
            pages.update(pp for pp in range(first, last + 1) if 0 <= pp < total_pages)
        return pages or every_page

    def process(self, pdf_path: str) -> Dict[str, Any]:
        """Process one PDF and return the image markdown and extracted subtopics.

        Args:
            pdf_path: Local path or http(s) URL of the PDF.

        Returns:
            ``{"final_markdown": str, "subtopics_extracted": list}``.

        Raises:
            Exception: If the PDF cannot be downloaded; errors from reading or
                analysing the PDF propagate. GPU cleanup runs in every case.
        """
        logger.info(f"Processing PDF: {pdf_path}")
        try:
            # 1) Ask Gemini for the subtopic page ranges from the first pages.
            subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
            logger.info(f"Gemini returned subtopics: {subtopics}")

            # 2) Load the document and count its pages.
            pdf_bytes = self._load_pdf_bytes(pdf_path)
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            try:
                total_pages = doc.page_count
            finally:
                doc.close()

            # 3) Decide which pages to process.
            final_pages = self._select_pages(pdf_bytes, subtopics, total_pages)
            logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
            subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))

            # 4) Analyze and produce markdown.
            dataset = PymuDocDataset(subset_pdf_bytes)
            inference = doc_analyze(
                dataset,
                ocr=True,
                lang=self.language,
                layout_model=self.layout_model,
                formula_enable=self.formula_enable,
                table_enable=self.table_enable
            )
            writer = S3ImageWriter(self.s3_writer, "/topic-extraction", self.gemini_api_key)
            md_prefix = "/topic-extraction/"
            pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
            md_content = pipe_result.get_markdown(md_prefix)
            final_markdown = writer.post_process(md_prefix, md_content)

            subtopic_list = list(writer.extracted_subtopics.values())

            # 5) Persist the subtopics next to the other outputs.
            out_path = os.path.join(self.output_folder, "final_subtopics.json")
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(subtopic_list, f, indent=2)
            logger.info(f"Final subtopics JSON saved locally at {out_path}")

            return {
                "final_markdown": final_markdown,
                "subtopics_extracted": subtopic_list
            }
        finally:
            self.cleanup_gpu()
872
+
873
if __name__ == "__main__":
    # Script entry point: run the pipeline on one hard-coded specification PDF.
    input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
    output_dir = "/home/user/app/we/we_ars"
    # Credentials must come from the environment only. The key that used to be
    # hard-coded here as a default was committed to version control and should
    # be revoked.
    gemini_key = os.getenv("GEMINI_API_KEY", "")
    if not gemini_key:
        logger.error("GEMINI_API_KEY is not set; aborting.")
        raise SystemExit(1)
    try:
        processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
        # The result includes final_markdown and subtopics_extracted.
        result = processor.process(input_pdf)
        logger.info("Processing completed successfully.")
    except Exception as e:
        # exc_info keeps the traceback so failures can be diagnosed from the log.
        logger.error(f"Processing failed: {e}", exc_info=True)
topic_extraction.log CHANGED
The diff for this file is too large to render. See raw diff
 
topic_extraction.py CHANGED
@@ -721,7 +721,7 @@ class MineruNoTextProcessor:
721
  self.cleanup_gpu()
722
 
723
  if __name__ == "__main__":
724
- input_pdf = "/home/user/app/input_output/wjec-gce-maths-spec-from-2017-e.pdf"
725
  output_dir = "/home/user/app/wje"
726
  gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
727
  try:
 
721
  self.cleanup_gpu()
722
 
723
  if __name__ == "__main__":
724
+ input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
725
  output_dir = "/home/user/app/wje"
726
  gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
727
  try:
topic_extraction_ars.log ADDED
@@ -0,0 +1,460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-03 15:45:38,171 [INFO] __main__ - Processing PDF: /home/user/app/input_output/a-level-pearson-mathematics-specification.pdf
2
+ 2025-03-03 15:45:38,974 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
3
+ 2025-03-03 15:45:38,975 [INFO] __main__ - Loaded 1135473 bytes from local file '/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf'
4
+ 2025-03-03 15:45:39,261 [INFO] __main__ - Computed global offset: 4
5
+ 2025-03-03 15:45:39,261 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
6
+ 2025-03-03 15:46:34,912 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
7
+ 2025-03-03 15:46:36,964 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
8
+ 2025-03-03 15:46:37,539 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
9
+ 2025-03-03 15:46:38,161 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
10
+ 2025-03-03 15:46:38,703 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
11
+ 2025-03-03 15:46:39,330 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
12
+ 2025-03-03 15:46:39,805 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
13
+ 2025-03-03 15:46:40,281 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
14
+ 2025-03-03 15:46:40,751 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
15
+ 2025-03-03 15:46:41,336 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
16
+ 2025-03-03 15:46:41,773 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
17
+ 2025-03-03 15:46:42,431 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
18
+ 2025-03-03 15:46:42,903 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
19
+ 2025-03-03 15:46:43,490 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
20
+ 2025-03-03 15:46:43,962 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
21
+ 2025-03-03 15:46:44,566 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
22
+ 2025-03-03 15:46:45,155 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
23
+ 2025-03-03 15:46:45,448 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
24
+ 2025-03-03 15:46:45,896 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
25
+ 2025-03-03 15:46:46,485 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
26
+ 2025-03-03 15:46:47,081 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
27
+ 2025-03-03 15:46:47,652 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
28
+ 2025-03-03 15:46:48,109 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
29
+ 2025-03-03 15:46:48,593 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
30
+ 2025-03-03 15:46:49,101 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
31
+ 2025-03-03 15:46:49,644 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
32
+ 2025-03-03 15:46:50,274 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
33
+ 2025-03-03 15:46:50,891 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
34
+ 2025-03-03 15:46:51,327 [INFO] __main__ - Classifying images to detect tables.
35
+ 2025-03-03 15:46:55,176 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
36
+ 2025-03-03 15:46:58,654 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c0.jpg
37
+ 2025-03-03 15:46:58,952 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c1.jpg
38
+ 2025-03-03 15:46:59,179 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c0.jpg
39
+ 2025-03-03 15:46:59,433 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c1.jpg
40
+ 2025-03-03 15:46:59,434 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
41
+ 2025-03-03 15:47:02,885 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c0.jpg
42
+ 2025-03-03 15:47:03,187 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c1.jpg
43
+ 2025-03-03 15:47:03,419 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r1_c0.jpg
44
+ 2025-03-03 15:47:03,657 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r2_c0.jpg
45
+ 2025-03-03 15:47:03,872 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r3_c0.jpg
46
+ 2025-03-03 15:47:03,873 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
47
+ 2025-03-03 15:47:07,421 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c0.jpg
48
+ 2025-03-03 15:47:07,712 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c1.jpg
49
+ 2025-03-03 15:47:07,918 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r1_c0.jpg
50
+ 2025-03-03 15:47:07,918 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
51
+ 2025-03-03 15:47:11,395 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c0.jpg
52
+ 2025-03-03 15:47:11,689 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c1.jpg
53
+ 2025-03-03 15:47:11,904 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c0.jpg
54
+ 2025-03-03 15:47:12,137 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c1.jpg
55
+ 2025-03-03 15:47:12,138 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
56
+ 2025-03-03 15:47:15,853 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c0.jpg
57
+ 2025-03-03 15:47:16,176 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c1.jpg
58
+ 2025-03-03 15:47:16,379 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c0.jpg
59
+ 2025-03-03 15:47:16,611 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c1.jpg
60
+ 2025-03-03 15:47:16,850 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r2_c0.jpg
61
+ 2025-03-03 15:47:16,850 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
62
+ 2025-03-03 15:47:20,810 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c0.jpg
63
+ 2025-03-03 15:47:21,101 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c1.jpg
64
+ 2025-03-03 15:47:21,322 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c0.jpg
65
+ 2025-03-03 15:47:21,549 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c1.jpg
66
+ 2025-03-03 15:47:21,549 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
67
+ 2025-03-03 15:47:25,075 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c0.jpg
68
+ 2025-03-03 15:47:25,405 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c1.jpg
69
+ 2025-03-03 15:47:25,599 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r1_c0.jpg
70
+ 2025-03-03 15:47:25,823 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r2_c0.jpg
71
+ 2025-03-03 15:47:26,054 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r2_c1.jpg
72
+ 2025-03-03 15:47:26,054 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
73
+ 2025-03-03 15:47:29,662 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c0.jpg
74
+ 2025-03-03 15:47:29,944 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c1.jpg
75
+ 2025-03-03 15:47:30,160 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c2.jpg
76
+ 2025-03-03 15:47:30,354 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c0.jpg
77
+ 2025-03-03 15:47:30,586 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c1.jpg
78
+ 2025-03-03 15:47:30,801 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c2.jpg
79
+ 2025-03-03 15:47:31,028 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r2_c0.jpg
80
+ 2025-03-03 15:47:31,232 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r2_c1.jpg
81
+ 2025-03-03 15:47:31,461 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r3_c0.jpg
82
+ 2025-03-03 15:47:31,654 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r3_c1.jpg
83
+ 2025-03-03 15:47:31,912 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r4_c0.jpg
84
+ 2025-03-03 15:47:32,139 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r4_c1.jpg
85
+ 2025-03-03 15:47:32,345 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r5_c0.jpg
86
+ 2025-03-03 15:47:32,586 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r5_c1.jpg
87
+ 2025-03-03 15:47:32,587 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
88
+ 2025-03-03 15:47:36,350 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c0.jpg
89
+ 2025-03-03 15:47:36,676 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c1.jpg
90
+ 2025-03-03 15:47:36,893 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c2.jpg
91
+ 2025-03-03 15:47:37,141 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r1_c0.jpg
92
+ 2025-03-03 15:47:37,374 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r1_c1.jpg
93
+ 2025-03-03 15:47:37,565 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r2_c0.jpg
94
+ 2025-03-03 15:47:37,760 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r2_c1.jpg
95
+ 2025-03-03 15:47:38,012 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r3_c0.jpg
96
+ 2025-03-03 15:47:38,226 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r3_c1.jpg
97
+ 2025-03-03 15:47:38,226 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
98
+ 2025-03-03 15:47:42,402 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c0.jpg
99
+ 2025-03-03 15:47:42,675 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c1.jpg
100
+ 2025-03-03 15:47:42,917 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r1_c0.jpg
101
+ 2025-03-03 15:47:43,133 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r2_c0.jpg
102
+ 2025-03-03 15:47:43,355 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r3_c0.jpg
103
+ 2025-03-03 15:47:43,355 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=two
104
+ 2025-03-03 15:47:48,037 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r0_c0.jpg
105
+ 2025-03-03 15:47:48,332 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r1_c0.jpg
106
+ 2025-03-03 15:47:48,540 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r2_c0.jpg
107
+ 2025-03-03 15:47:48,786 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r3_c0.jpg
108
+ 2025-03-03 15:47:49,037 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r4_c0.jpg
109
+ 2025-03-03 15:47:49,264 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r5_c0.jpg
110
+ 2025-03-03 15:47:49,264 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
111
+ 2025-03-03 15:47:53,266 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c0.jpg
112
+ 2025-03-03 15:47:53,598 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c1.jpg
113
+ 2025-03-03 15:47:53,819 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r1_c0.jpg
114
+ 2025-03-03 15:47:54,034 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r1_c1.jpg
115
+ 2025-03-03 15:47:54,250 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r2_c0.jpg
116
+ 2025-03-03 15:47:54,474 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r2_c1.jpg
117
+ 2025-03-03 15:47:54,474 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
118
+ 2025-03-03 15:47:57,779 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c0.jpg
119
+ 2025-03-03 15:47:58,103 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c1.jpg
120
+ 2025-03-03 15:47:58,326 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r1_c0.jpg
121
+ 2025-03-03 15:47:58,545 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r1_c1.jpg
122
+ 2025-03-03 15:47:58,738 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r2_c0.jpg
123
+ 2025-03-03 15:47:58,994 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r3_c0.jpg
124
+ 2025-03-03 15:47:58,994 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=three
125
+ 2025-03-03 15:48:03,866 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r0_c0.jpg
126
+ 2025-03-03 15:48:04,164 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r0_c1.jpg
127
+ 2025-03-03 15:48:04,382 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r1_c0.jpg
128
+ 2025-03-03 15:48:04,605 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r1_c1.jpg
129
+ 2025-03-03 15:48:04,799 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r2_c0.jpg
130
+ 2025-03-03 15:48:05,032 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r3_c0.jpg
131
+ 2025-03-03 15:48:05,247 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r4_c0.jpg
132
+ 2025-03-03 15:48:05,493 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r4_c1.jpg
133
+ 2025-03-03 15:48:05,710 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r5_c0.jpg
134
+ 2025-03-03 15:48:05,711 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
135
+ 2025-03-03 15:48:09,411 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c0.jpg
136
+ 2025-03-03 15:48:09,698 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c1.jpg
137
+ 2025-03-03 15:48:09,923 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r1_c0.jpg
138
+ 2025-03-03 15:48:10,113 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r1_c1.jpg
139
+ 2025-03-03 15:48:10,361 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r2_c0.jpg
140
+ 2025-03-03 15:48:10,587 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r3_c0.jpg
141
+ 2025-03-03 15:48:10,799 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r4_c0.jpg
142
+ 2025-03-03 15:48:10,800 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
143
+ 2025-03-03 15:48:14,668 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c0.jpg
144
+ 2025-03-03 15:48:14,969 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c1.jpg
145
+ 2025-03-03 15:48:15,207 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r1_c0.jpg
146
+ 2025-03-03 15:48:15,414 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r1_c1.jpg
147
+ 2025-03-03 15:48:15,634 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r2_c0.jpg
148
+ 2025-03-03 15:48:15,893 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r3_c0.jpg
149
+ 2025-03-03 15:48:16,111 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r3_c1.jpg
150
+ 2025-03-03 15:48:16,343 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r4_c0.jpg
151
+ 2025-03-03 15:48:17,176 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r5_c0.jpg
152
+ 2025-03-03 15:48:17,176 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
153
+ 2025-03-03 15:48:20,954 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c0.jpg
154
+ 2025-03-03 15:48:21,213 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c1.jpg
155
+ 2025-03-03 15:48:21,423 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r1_c0.jpg
156
+ 2025-03-03 15:48:21,634 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r2_c0.jpg
157
+ 2025-03-03 15:48:21,832 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r2_c1.jpg
158
+ 2025-03-03 15:48:22,056 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r3_c0.jpg
159
+ 2025-03-03 15:48:22,261 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r4_c0.jpg
160
+ 2025-03-03 15:48:22,481 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r5_c0.jpg
161
+ 2025-03-03 15:48:22,482 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
162
+ 2025-03-03 15:48:23,665 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c0.jpg
163
+ 2025-03-03 15:48:23,852 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c1.jpg
164
+ 2025-03-03 15:48:24,035 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r1_c0.jpg
165
+ 2025-03-03 15:48:24,219 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r1_c1.jpg
166
+ 2025-03-03 15:48:24,219 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
167
+ 2025-03-03 15:48:27,206 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c0.jpg
168
+ 2025-03-03 15:48:27,482 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c1.jpg
169
+ 2025-03-03 15:48:27,693 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r1_c0.jpg
170
+ 2025-03-03 15:48:27,924 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r1_c1.jpg
171
+ 2025-03-03 15:48:28,131 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r2_c0.jpg
172
+ 2025-03-03 15:48:28,337 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r2_c1.jpg
173
+ 2025-03-03 15:48:28,338 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=three
174
+ 2025-03-03 15:48:32,733 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r0_c0.jpg
175
+ 2025-03-03 15:48:32,995 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r0_c1.jpg
176
+ 2025-03-03 15:48:33,221 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r1_c0.jpg
177
+ 2025-03-03 15:48:33,449 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r1_c1.jpg
178
+ 2025-03-03 15:48:33,449 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
179
+ 2025-03-03 15:48:37,495 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c0.jpg
180
+ 2025-03-03 15:48:37,802 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c1.jpg
181
+ 2025-03-03 15:48:38,060 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r1_c0.jpg
182
+ 2025-03-03 15:48:38,267 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r1_c1.jpg
183
+ 2025-03-03 15:48:38,267 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
184
+ 2025-03-03 15:48:42,539 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c0.jpg
185
+ 2025-03-03 15:48:42,847 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c1.jpg
186
+ 2025-03-03 15:48:43,064 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r1_c0.jpg
187
+ 2025-03-03 15:48:43,280 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r1_c1.jpg
188
+ 2025-03-03 15:48:43,487 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r2_c0.jpg
189
+ 2025-03-03 15:48:43,716 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r2_c1.jpg
190
+ 2025-03-03 15:48:43,918 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r3_c0.jpg
191
+ 2025-03-03 15:48:43,918 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
192
+ 2025-03-03 15:48:47,600 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c0.jpg
193
+ 2025-03-03 15:48:47,900 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c1.jpg
194
+ 2025-03-03 15:48:48,125 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r1_c0.jpg
195
+ 2025-03-03 15:48:48,343 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r1_c1.jpg
196
+ 2025-03-03 15:48:48,343 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
197
+ 2025-03-03 15:48:52,065 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c0.jpg
198
+ 2025-03-03 15:48:52,376 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c1.jpg
199
+ 2025-03-03 15:48:52,614 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r1_c0.jpg
200
+ 2025-03-03 15:48:52,870 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r1_c1.jpg
201
+ 2025-03-03 15:48:53,066 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r2_c0.jpg
202
+ 2025-03-03 15:48:53,066 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=two
203
+ 2025-03-03 15:48:56,548 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r0_c0.jpg
204
+ 2025-03-03 15:48:56,863 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r1_c0.jpg
205
+ 2025-03-03 15:48:57,087 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r2_c0.jpg
206
+ 2025-03-03 15:48:57,301 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r3_c0.jpg
207
+ 2025-03-03 15:48:57,526 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r4_c0.jpg
208
+ 2025-03-03 15:48:57,759 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r5_c0.jpg
209
+ 2025-03-03 15:48:57,759 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
210
+ 2025-03-03 15:49:01,116 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c0.jpg
211
+ 2025-03-03 15:49:01,407 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c1.jpg
212
+ 2025-03-03 15:49:01,618 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r1_c0.jpg
213
+ 2025-03-03 15:49:01,847 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r2_c0.jpg
214
+ 2025-03-03 15:49:01,847 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
215
+ 2025-03-03 15:49:04,977 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c0.jpg
216
+ 2025-03-03 15:49:05,258 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c1.jpg
217
+ 2025-03-03 15:49:05,498 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r1_c0.jpg
218
+ 2025-03-03 15:49:05,712 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r1_c1.jpg
219
+ 2025-03-03 15:49:05,934 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r2_c0.jpg
220
+ 2025-03-03 15:49:06,162 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r3_c0.jpg
221
+ 2025-03-03 15:49:06,385 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r4_c0.jpg
222
+ 2025-03-03 15:49:06,612 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r4_c1.jpg
223
+ 2025-03-03 15:49:06,613 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=two
224
+ 2025-03-03 15:49:10,036 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r0_c0.jpg
225
+ 2025-03-03 15:49:10,328 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r1_c0.jpg
226
+ 2025-03-03 15:49:10,548 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r2_c0.jpg
227
+ 2025-03-03 15:49:10,777 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r3_c0.jpg
228
+ 2025-03-03 15:49:10,780 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/we/we_ars/final_subtopics.json
229
+ 2025-03-03 15:49:11,098 [INFO] __main__ - GPU memory cleaned up.
230
+ 2025-03-03 15:49:11,106 [INFO] __main__ - Processing completed successfully.
231
+ 2025-03-03 15:53:27,401 [INFO] __main__ - Processing PDF: /home/user/app/input_output/a-level-pearson-mathematics-specification.pdf
232
+ 2025-03-03 15:53:28,230 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
233
+ 2025-03-03 15:53:28,231 [INFO] __main__ - Loaded 1135473 bytes from local file '/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf'
234
+ 2025-03-03 15:53:28,557 [INFO] __main__ - Computed global offset: 4
235
+ 2025-03-03 15:53:28,557 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
236
+ 2025-03-03 15:54:23,423 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
237
+ 2025-03-03 15:54:25,210 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
238
+ 2025-03-03 15:54:25,742 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
239
+ 2025-03-03 15:54:26,250 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
240
+ 2025-03-03 15:54:26,794 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
241
+ 2025-03-03 15:54:27,347 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
242
+ 2025-03-03 15:54:27,803 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
243
+ 2025-03-03 15:54:28,391 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
244
+ 2025-03-03 15:54:28,891 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
245
+ 2025-03-03 15:54:29,437 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
246
+ 2025-03-03 15:54:29,870 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
247
+ 2025-03-03 15:54:30,421 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
248
+ 2025-03-03 15:54:30,852 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
249
+ 2025-03-03 15:54:31,438 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
250
+ 2025-03-03 15:54:32,029 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
251
+ 2025-03-03 15:54:32,600 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
252
+ 2025-03-03 15:54:33,157 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
253
+ 2025-03-03 15:54:33,444 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
254
+ 2025-03-03 15:54:33,920 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
255
+ 2025-03-03 15:54:34,554 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
256
+ 2025-03-03 15:54:35,147 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
257
+ 2025-03-03 15:54:35,680 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
258
+ 2025-03-03 15:54:36,094 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
259
+ 2025-03-03 15:54:36,554 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
260
+ 2025-03-03 15:54:37,089 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
261
+ 2025-03-03 15:54:37,502 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
262
+ 2025-03-03 15:54:38,008 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
263
+ 2025-03-03 15:54:38,585 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
264
+ 2025-03-03 15:54:39,068 [INFO] __main__ - Classifying images to detect tables.
265
+ 2025-03-03 15:54:42,753 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
266
+ 2025-03-03 15:54:46,419 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c0.jpg
267
+ 2025-03-03 15:54:46,711 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c1.jpg
268
+ 2025-03-03 15:54:46,896 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c0.jpg
269
+ 2025-03-03 15:54:47,110 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c1.jpg
270
+ 2025-03-03 15:54:47,110 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
271
+ 2025-03-03 15:54:50,464 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c0.jpg
272
+ 2025-03-03 15:54:50,784 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c1.jpg
273
+ 2025-03-03 15:54:50,976 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r1_c0.jpg
274
+ 2025-03-03 15:54:51,228 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r2_c0.jpg
275
+ 2025-03-03 15:54:51,462 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r3_c0.jpg
276
+ 2025-03-03 15:54:51,463 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
277
+ 2025-03-03 15:54:55,079 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c0.jpg
278
+ 2025-03-03 15:54:55,364 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c1.jpg
279
+ 2025-03-03 15:54:55,570 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r1_c0.jpg
280
+ 2025-03-03 15:54:55,571 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
281
+ 2025-03-03 15:54:58,838 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c0.jpg
282
+ 2025-03-03 15:54:59,144 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c1.jpg
283
+ 2025-03-03 15:54:59,326 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c0.jpg
284
+ 2025-03-03 15:54:59,577 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c1.jpg
285
+ 2025-03-03 15:54:59,578 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
286
+ 2025-03-03 15:55:03,518 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c0.jpg
287
+ 2025-03-03 15:55:03,801 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c1.jpg
288
+ 2025-03-03 15:55:03,983 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c0.jpg
289
+ 2025-03-03 15:55:04,202 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c1.jpg
290
+ 2025-03-03 15:55:04,417 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r2_c0.jpg
291
+ 2025-03-03 15:55:04,417 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
292
+ 2025-03-03 15:55:08,109 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c0.jpg
293
+ 2025-03-03 15:55:08,423 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c1.jpg
294
+ 2025-03-03 15:55:08,629 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c0.jpg
295
+ 2025-03-03 15:55:08,816 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c1.jpg
296
+ 2025-03-03 15:55:08,816 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
297
+ 2025-03-03 15:55:12,344 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c0.jpg
298
+ 2025-03-03 15:55:12,644 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c1.jpg
299
+ 2025-03-03 15:55:12,867 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r1_c0.jpg
300
+ 2025-03-03 15:55:13,114 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r2_c0.jpg
301
+ 2025-03-03 15:55:13,343 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r2_c1.jpg
302
+ 2025-03-03 15:55:13,344 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
303
+ 2025-03-03 15:55:16,823 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c0.jpg
304
+ 2025-03-03 15:55:17,140 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c1.jpg
305
+ 2025-03-03 15:55:17,422 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c2.jpg
306
+ 2025-03-03 15:55:17,706 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c0.jpg
307
+ 2025-03-03 15:55:18,019 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c1.jpg
308
+ 2025-03-03 15:55:18,320 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c2.jpg
309
+ 2025-03-03 15:55:18,619 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r2_c0.jpg
310
+ 2025-03-03 15:55:18,911 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r2_c1.jpg
311
+ 2025-03-03 15:55:19,208 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r3_c0.jpg
312
+ 2025-03-03 15:55:19,491 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r3_c1.jpg
313
+ 2025-03-03 15:55:19,806 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r4_c0.jpg
314
+ 2025-03-03 15:55:20,093 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r4_c1.jpg
315
+ 2025-03-03 15:55:20,406 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r5_c0.jpg
316
+ 2025-03-03 15:55:20,689 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r5_c1.jpg
317
+ 2025-03-03 15:55:20,690 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
318
+ 2025-03-03 15:55:24,558 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c0.jpg
319
+ 2025-03-03 15:55:24,859 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c1.jpg
320
+ 2025-03-03 15:55:25,142 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c2.jpg
321
+ 2025-03-03 15:55:25,422 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r1_c0.jpg
322
+ 2025-03-03 15:55:25,738 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r1_c1.jpg
323
+ 2025-03-03 15:55:26,031 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r2_c0.jpg
324
+ 2025-03-03 15:55:26,335 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r2_c1.jpg
325
+ 2025-03-03 15:55:26,616 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r3_c0.jpg
326
+ 2025-03-03 15:55:26,908 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r3_c1.jpg
327
+ 2025-03-03 15:55:26,909 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
328
+ 2025-03-03 15:55:30,379 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c0.jpg
329
+ 2025-03-03 15:55:30,667 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c1.jpg
330
+ 2025-03-03 15:55:30,961 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r1_c0.jpg
331
+ 2025-03-03 15:55:31,248 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r2_c0.jpg
332
+ 2025-03-03 15:55:31,547 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r3_c0.jpg
333
+ 2025-03-03 15:55:31,549 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=two
334
+ 2025-03-03 15:55:34,706 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r0_c0.jpg
335
+ 2025-03-03 15:55:34,994 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r1_c0.jpg
336
+ 2025-03-03 15:55:35,254 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r2_c0.jpg
337
+ 2025-03-03 15:55:35,558 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r3_c0.jpg
338
+ 2025-03-03 15:55:35,852 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r4_c0.jpg
339
+ 2025-03-03 15:55:36,137 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r5_c0.jpg
340
+ 2025-03-03 15:55:36,137 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
341
+ 2025-03-03 15:55:39,497 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c0.jpg
342
+ 2025-03-03 15:55:39,757 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c1.jpg
343
+ 2025-03-03 15:55:40,062 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r1_c0.jpg
344
+ 2025-03-03 15:55:40,345 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r1_c1.jpg
345
+ 2025-03-03 15:55:40,666 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r2_c0.jpg
346
+ 2025-03-03 15:55:40,976 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r2_c1.jpg
347
+ 2025-03-03 15:55:40,977 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
348
+ 2025-03-03 15:55:44,159 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c0.jpg
349
+ 2025-03-03 15:55:44,436 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c1.jpg
350
+ 2025-03-03 15:55:44,643 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r1_c0.jpg
351
+ 2025-03-03 15:55:44,853 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r1_c1.jpg
352
+ 2025-03-03 15:55:45,041 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r2_c0.jpg
353
+ 2025-03-03 15:55:45,254 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r3_c0.jpg
354
+ 2025-03-03 15:55:45,255 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=three
355
+ 2025-03-03 15:55:49,508 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r0_c0.jpg
356
+ 2025-03-03 15:55:49,786 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r0_c1.jpg
357
+ 2025-03-03 15:55:50,075 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r1_c0.jpg
358
+ 2025-03-03 15:55:50,355 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r1_c1.jpg
359
+ 2025-03-03 15:55:50,647 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r2_c0.jpg
360
+ 2025-03-03 15:55:50,978 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r3_c0.jpg
361
+ 2025-03-03 15:55:51,295 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r4_c0.jpg
362
+ 2025-03-03 15:55:51,582 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r4_c1.jpg
363
+ 2025-03-03 15:55:51,855 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r5_c0.jpg
364
+ 2025-03-03 15:55:51,856 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
365
+ 2025-03-03 15:55:55,882 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c0.jpg
366
+ 2025-03-03 15:55:56,182 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c1.jpg
367
+ 2025-03-03 15:55:56,463 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r1_c0.jpg
368
+ 2025-03-03 15:55:56,727 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r1_c1.jpg
369
+ 2025-03-03 15:55:57,005 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r2_c0.jpg
370
+ 2025-03-03 15:55:57,301 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r3_c0.jpg
371
+ 2025-03-03 15:55:57,584 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r4_c0.jpg
372
+ 2025-03-03 15:55:57,584 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
373
+ 2025-03-03 15:56:01,615 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c0.jpg
374
+ 2025-03-03 15:56:01,906 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c1.jpg
375
+ 2025-03-03 15:56:02,222 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r1_c0.jpg
376
+ 2025-03-03 15:56:02,513 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r1_c1.jpg
377
+ 2025-03-03 15:56:02,801 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r2_c0.jpg
378
+ 2025-03-03 15:56:03,083 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r3_c0.jpg
379
+ 2025-03-03 15:56:03,393 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r3_c1.jpg
380
+ 2025-03-03 15:56:03,676 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r4_c0.jpg
381
+ 2025-03-03 15:56:04,667 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r5_c0.jpg
382
+ 2025-03-03 15:56:04,667 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
383
+ 2025-03-03 15:56:09,007 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c0.jpg
384
+ 2025-03-03 15:56:09,286 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c1.jpg
385
+ 2025-03-03 15:56:09,520 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r1_c0.jpg
386
+ 2025-03-03 15:56:09,740 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r2_c0.jpg
387
+ 2025-03-03 15:56:09,947 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r2_c1.jpg
388
+ 2025-03-03 15:56:10,171 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r3_c0.jpg
389
+ 2025-03-03 15:56:10,389 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r4_c0.jpg
390
+ 2025-03-03 15:56:10,610 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r5_c0.jpg
391
+ 2025-03-03 15:56:10,610 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
392
+ 2025-03-03 15:56:11,718 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c0.jpg
393
+ 2025-03-03 15:56:11,899 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c1.jpg
394
+ 2025-03-03 15:56:12,081 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r1_c0.jpg
395
+ 2025-03-03 15:56:12,266 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r1_c1.jpg
396
+ 2025-03-03 15:56:12,266 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
397
+ 2025-03-03 15:56:15,231 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c0.jpg
398
+ 2025-03-03 15:56:15,582 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c1.jpg
399
+ 2025-03-03 15:56:15,802 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r1_c0.jpg
400
+ 2025-03-03 15:56:16,018 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r1_c1.jpg
401
+ 2025-03-03 15:56:16,234 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r2_c0.jpg
402
+ 2025-03-03 15:56:16,451 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r2_c1.jpg
403
+ 2025-03-03 15:56:16,452 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=three
404
+ 2025-03-03 15:56:20,970 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r0_c0.jpg
405
+ 2025-03-03 15:56:21,300 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r0_c1.jpg
406
+ 2025-03-03 15:56:21,518 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r1_c0.jpg
407
+ 2025-03-03 15:56:21,742 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r1_c1.jpg
408
+ 2025-03-03 15:56:21,742 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
409
+ 2025-03-03 15:56:25,577 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c0.jpg
410
+ 2025-03-03 15:56:25,883 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c1.jpg
411
+ 2025-03-03 15:56:26,108 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r1_c0.jpg
412
+ 2025-03-03 15:56:26,319 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r1_c1.jpg
413
+ 2025-03-03 15:56:26,320 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
414
+ 2025-03-03 15:56:30,722 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c0.jpg
415
+ 2025-03-03 15:56:31,018 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c1.jpg
416
+ 2025-03-03 15:56:31,267 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r1_c0.jpg
417
+ 2025-03-03 15:56:31,455 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r1_c1.jpg
418
+ 2025-03-03 15:56:31,684 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r2_c0.jpg
419
+ 2025-03-03 15:56:31,904 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r2_c1.jpg
420
+ 2025-03-03 15:56:32,136 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r3_c0.jpg
421
+ 2025-03-03 15:56:32,136 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
422
+ 2025-03-03 15:56:35,410 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c0.jpg
423
+ 2025-03-03 15:56:35,689 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c1.jpg
424
+ 2025-03-03 15:56:35,917 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r1_c0.jpg
425
+ 2025-03-03 15:56:36,143 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r1_c1.jpg
426
+ 2025-03-03 15:56:36,144 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
427
+ 2025-03-03 15:56:39,869 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c0.jpg
428
+ 2025-03-03 15:56:40,150 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c1.jpg
429
+ 2025-03-03 15:56:40,387 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r1_c0.jpg
430
+ 2025-03-03 15:56:40,608 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r1_c1.jpg
431
+ 2025-03-03 15:56:40,828 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r2_c0.jpg
432
+ 2025-03-03 15:56:40,829 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=two
433
+ 2025-03-03 15:56:44,221 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r0_c0.jpg
434
+ 2025-03-03 15:56:44,522 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r1_c0.jpg
435
+ 2025-03-03 15:56:44,728 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r2_c0.jpg
436
+ 2025-03-03 15:56:44,929 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r3_c0.jpg
437
+ 2025-03-03 15:56:45,153 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r4_c0.jpg
438
+ 2025-03-03 15:56:45,372 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r5_c0.jpg
439
+ 2025-03-03 15:56:45,372 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
440
+ 2025-03-03 15:56:48,485 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c0.jpg
441
+ 2025-03-03 15:56:48,806 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c1.jpg
442
+ 2025-03-03 15:56:49,036 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r1_c0.jpg
443
+ 2025-03-03 15:56:49,282 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r2_c0.jpg
444
+ 2025-03-03 15:56:49,282 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
445
+ 2025-03-03 15:56:52,374 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c0.jpg
446
+ 2025-03-03 15:56:52,664 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c1.jpg
447
+ 2025-03-03 15:56:52,887 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r1_c0.jpg
448
+ 2025-03-03 15:56:53,103 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r1_c1.jpg
449
+ 2025-03-03 15:56:53,329 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r2_c0.jpg
450
+ 2025-03-03 15:56:53,543 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r3_c0.jpg
451
+ 2025-03-03 15:56:53,759 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r4_c0.jpg
452
+ 2025-03-03 15:56:53,978 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r4_c1.jpg
453
+ 2025-03-03 15:56:53,979 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=two
454
+ 2025-03-03 15:56:57,389 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r0_c0.jpg
455
+ 2025-03-03 15:56:57,690 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r1_c0.jpg
456
+ 2025-03-03 15:56:57,897 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r2_c0.jpg
457
+ 2025-03-03 15:56:58,126 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r3_c0.jpg
458
+ 2025-03-03 15:56:58,131 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/we/we_ars/final_subtopics.json
459
+ 2025-03-03 15:56:58,438 [INFO] __main__ - GPU memory cleaned up.
460
+ 2025-03-03 15:56:58,445 [INFO] __main__ - Processing completed successfully.
we/final_subtopics.json CHANGED
@@ -1 +1,282 @@
1
- []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "title": "",
4
+ "contents": [
5
+ {
6
+ "type": "image",
7
+ "key": "/topic-extraction/img_1.jpg"
8
+ }
9
+ ],
10
+ "children": []
11
+ },
12
+ {
13
+ "title": "",
14
+ "contents": [
15
+ {
16
+ "type": "image",
17
+ "key": "/topic-extraction/img_2.jpg"
18
+ }
19
+ ],
20
+ "children": []
21
+ },
22
+ {
23
+ "title": "",
24
+ "contents": [
25
+ {
26
+ "type": "image",
27
+ "key": "/topic-extraction/img_3.jpg"
28
+ }
29
+ ],
30
+ "children": []
31
+ },
32
+ {
33
+ "title": "",
34
+ "contents": [
35
+ {
36
+ "type": "image",
37
+ "key": "/topic-extraction/img_4.jpg"
38
+ }
39
+ ],
40
+ "children": []
41
+ },
42
+ {
43
+ "title": "",
44
+ "contents": [
45
+ {
46
+ "type": "image",
47
+ "key": "/topic-extraction/img_5.jpg"
48
+ }
49
+ ],
50
+ "children": []
51
+ },
52
+ {
53
+ "title": "",
54
+ "contents": [
55
+ {
56
+ "type": "image",
57
+ "key": "/topic-extraction/img_6.jpg"
58
+ }
59
+ ],
60
+ "children": []
61
+ },
62
+ {
63
+ "title": "",
64
+ "contents": [
65
+ {
66
+ "type": "image",
67
+ "key": "/topic-extraction/img_7.jpg"
68
+ }
69
+ ],
70
+ "children": []
71
+ },
72
+ {
73
+ "title": "",
74
+ "contents": [
75
+ {
76
+ "type": "image",
77
+ "key": "/topic-extraction/img_8.jpg"
78
+ }
79
+ ],
80
+ "children": []
81
+ },
82
+ {
83
+ "title": "",
84
+ "contents": [
85
+ {
86
+ "type": "image",
87
+ "key": "/topic-extraction/img_9.jpg"
88
+ }
89
+ ],
90
+ "children": []
91
+ },
92
+ {
93
+ "title": "",
94
+ "contents": [
95
+ {
96
+ "type": "image",
97
+ "key": "/topic-extraction/img_10.jpg"
98
+ }
99
+ ],
100
+ "children": []
101
+ },
102
+ {
103
+ "title": "",
104
+ "contents": [
105
+ {
106
+ "type": "image",
107
+ "key": "/topic-extraction/img_11.jpg"
108
+ }
109
+ ],
110
+ "children": []
111
+ },
112
+ {
113
+ "title": "",
114
+ "contents": [
115
+ {
116
+ "type": "image",
117
+ "key": "/topic-extraction/img_12.jpg"
118
+ }
119
+ ],
120
+ "children": []
121
+ },
122
+ {
123
+ "title": "",
124
+ "contents": [
125
+ {
126
+ "type": "image",
127
+ "key": "/topic-extraction/img_13.jpg"
128
+ }
129
+ ],
130
+ "children": []
131
+ },
132
+ {
133
+ "title": "",
134
+ "contents": [
135
+ {
136
+ "type": "image",
137
+ "key": "/topic-extraction/img_14.jpg"
138
+ }
139
+ ],
140
+ "children": []
141
+ },
142
+ {
143
+ "title": "",
144
+ "contents": [
145
+ {
146
+ "type": "image",
147
+ "key": "/topic-extraction/img_15.jpg"
148
+ }
149
+ ],
150
+ "children": []
151
+ },
152
+ {
153
+ "title": "",
154
+ "contents": [
155
+ {
156
+ "type": "image",
157
+ "key": "/topic-extraction/img_16.jpg"
158
+ }
159
+ ],
160
+ "children": []
161
+ },
162
+ {
163
+ "title": "",
164
+ "contents": [
165
+ {
166
+ "type": "image",
167
+ "key": "/topic-extraction/img_17.jpg"
168
+ }
169
+ ],
170
+ "children": []
171
+ },
172
+ {
173
+ "title": "",
174
+ "contents": [
175
+ {
176
+ "type": "image",
177
+ "key": "/topic-extraction/img_18.jpg"
178
+ }
179
+ ],
180
+ "children": []
181
+ },
182
+ {
183
+ "title": "",
184
+ "contents": [
185
+ {
186
+ "type": "image",
187
+ "key": "/topic-extraction/img_19.jpg"
188
+ }
189
+ ],
190
+ "children": []
191
+ },
192
+ {
193
+ "title": "",
194
+ "contents": [
195
+ {
196
+ "type": "image",
197
+ "key": "/topic-extraction/img_20.jpg"
198
+ }
199
+ ],
200
+ "children": []
201
+ },
202
+ {
203
+ "title": "",
204
+ "contents": [
205
+ {
206
+ "type": "image",
207
+ "key": "/topic-extraction/img_21.jpg"
208
+ }
209
+ ],
210
+ "children": []
211
+ },
212
+ {
213
+ "title": "",
214
+ "contents": [
215
+ {
216
+ "type": "image",
217
+ "key": "/topic-extraction/img_22.jpg"
218
+ }
219
+ ],
220
+ "children": []
221
+ },
222
+ {
223
+ "title": "",
224
+ "contents": [
225
+ {
226
+ "type": "image",
227
+ "key": "/topic-extraction/img_23.jpg"
228
+ }
229
+ ],
230
+ "children": []
231
+ },
232
+ {
233
+ "title": "",
234
+ "contents": [
235
+ {
236
+ "type": "image",
237
+ "key": "/topic-extraction/img_24.jpg"
238
+ }
239
+ ],
240
+ "children": []
241
+ },
242
+ {
243
+ "title": "",
244
+ "contents": [
245
+ {
246
+ "type": "image",
247
+ "key": "/topic-extraction/img_25.jpg"
248
+ }
249
+ ],
250
+ "children": []
251
+ },
252
+ {
253
+ "title": "",
254
+ "contents": [
255
+ {
256
+ "type": "image",
257
+ "key": "/topic-extraction/img_26.jpg"
258
+ }
259
+ ],
260
+ "children": []
261
+ },
262
+ {
263
+ "title": "",
264
+ "contents": [
265
+ {
266
+ "type": "image",
267
+ "key": "/topic-extraction/img_27.jpg"
268
+ }
269
+ ],
270
+ "children": []
271
+ },
272
+ {
273
+ "title": "",
274
+ "contents": [
275
+ {
276
+ "type": "image",
277
+ "key": "/topic-extraction/img_28.jpg"
278
+ }
279
+ ],
280
+ "children": []
281
+ }
282
+ ]
we/we_ars/final_subtopics.json ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "title": null,
4
+ "contents": [
5
+ {
6
+ "type": "image",
7
+ "key": "/topic-extraction/img_1.jpg"
8
+ }
9
+ ],
10
+ "children": []
11
+ },
12
+ {
13
+ "title": null,
14
+ "contents": [
15
+ {
16
+ "type": "image",
17
+ "key": "/topic-extraction/img_2.jpg"
18
+ }
19
+ ],
20
+ "children": []
21
+ },
22
+ {
23
+ "title": null,
24
+ "contents": [
25
+ {
26
+ "type": "image",
27
+ "key": "/topic-extraction/img_3.jpg"
28
+ }
29
+ ],
30
+ "children": []
31
+ },
32
+ {
33
+ "title": null,
34
+ "contents": [
35
+ {
36
+ "type": "image",
37
+ "key": "/topic-extraction/img_4.jpg"
38
+ }
39
+ ],
40
+ "children": []
41
+ },
42
+ {
43
+ "title": null,
44
+ "contents": [
45
+ {
46
+ "type": "image",
47
+ "key": "/topic-extraction/img_5.jpg"
48
+ }
49
+ ],
50
+ "children": []
51
+ },
52
+ {
53
+ "title": null,
54
+ "contents": [
55
+ {
56
+ "type": "image",
57
+ "key": "/topic-extraction/img_6.jpg"
58
+ }
59
+ ],
60
+ "children": []
61
+ },
62
+ {
63
+ "title": null,
64
+ "contents": [
65
+ {
66
+ "type": "image",
67
+ "key": "/topic-extraction/img_7.jpg"
68
+ }
69
+ ],
70
+ "children": []
71
+ },
72
+ {
73
+ "title": null,
74
+ "contents": [
75
+ {
76
+ "type": "image",
77
+ "key": "/topic-extraction/img_8.jpg"
78
+ }
79
+ ],
80
+ "children": []
81
+ },
82
+ {
83
+ "title": null,
84
+ "contents": [
85
+ {
86
+ "type": "image",
87
+ "key": "/topic-extraction/img_9.jpg"
88
+ }
89
+ ],
90
+ "children": []
91
+ },
92
+ {
93
+ "title": null,
94
+ "contents": [
95
+ {
96
+ "type": "image",
97
+ "key": "/topic-extraction/img_10.jpg"
98
+ }
99
+ ],
100
+ "children": []
101
+ },
102
+ {
103
+ "title": null,
104
+ "contents": [
105
+ {
106
+ "type": "image",
107
+ "key": "/topic-extraction/img_11.jpg"
108
+ }
109
+ ],
110
+ "children": []
111
+ },
112
+ {
113
+ "title": null,
114
+ "contents": [
115
+ {
116
+ "type": "image",
117
+ "key": "/topic-extraction/img_12.jpg"
118
+ }
119
+ ],
120
+ "children": []
121
+ },
122
+ {
123
+ "title": null,
124
+ "contents": [
125
+ {
126
+ "type": "image",
127
+ "key": "/topic-extraction/img_13.jpg"
128
+ }
129
+ ],
130
+ "children": []
131
+ },
132
+ {
133
+ "title": null,
134
+ "contents": [
135
+ {
136
+ "type": "image",
137
+ "key": "/topic-extraction/img_14.jpg"
138
+ }
139
+ ],
140
+ "children": []
141
+ },
142
+ {
143
+ "title": null,
144
+ "contents": [
145
+ {
146
+ "type": "image",
147
+ "key": "/topic-extraction/img_15.jpg"
148
+ }
149
+ ],
150
+ "children": []
151
+ },
152
+ {
153
+ "title": null,
154
+ "contents": [
155
+ {
156
+ "type": "image",
157
+ "key": "/topic-extraction/img_16.jpg"
158
+ }
159
+ ],
160
+ "children": []
161
+ },
162
+ {
163
+ "title": null,
164
+ "contents": [
165
+ {
166
+ "type": "image",
167
+ "key": "/topic-extraction/img_17.jpg"
168
+ }
169
+ ],
170
+ "children": []
171
+ },
172
+ {
173
+ "title": null,
174
+ "contents": [
175
+ {
176
+ "type": "image",
177
+ "key": "/topic-extraction/img_18.jpg"
178
+ }
179
+ ],
180
+ "children": []
181
+ },
182
+ {
183
+ "title": null,
184
+ "contents": [
185
+ {
186
+ "type": "image",
187
+ "key": "/topic-extraction/img_19.jpg"
188
+ }
189
+ ],
190
+ "children": []
191
+ },
192
+ {
193
+ "title": null,
194
+ "contents": [
195
+ {
196
+ "type": "image",
197
+ "key": "/topic-extraction/img_20.jpg"
198
+ }
199
+ ],
200
+ "children": []
201
+ },
202
+ {
203
+ "title": null,
204
+ "contents": [
205
+ {
206
+ "type": "image",
207
+ "key": "/topic-extraction/img_21.jpg"
208
+ }
209
+ ],
210
+ "children": []
211
+ },
212
+ {
213
+ "title": null,
214
+ "contents": [
215
+ {
216
+ "type": "image",
217
+ "key": "/topic-extraction/img_22.jpg"
218
+ }
219
+ ],
220
+ "children": []
221
+ },
222
+ {
223
+ "title": null,
224
+ "contents": [
225
+ {
226
+ "type": "image",
227
+ "key": "/topic-extraction/img_23.jpg"
228
+ }
229
+ ],
230
+ "children": []
231
+ },
232
+ {
233
+ "title": null,
234
+ "contents": [
235
+ {
236
+ "type": "image",
237
+ "key": "/topic-extraction/img_24.jpg"
238
+ }
239
+ ],
240
+ "children": []
241
+ },
242
+ {
243
+ "title": null,
244
+ "contents": [
245
+ {
246
+ "type": "image",
247
+ "key": "/topic-extraction/img_25.jpg"
248
+ }
249
+ ],
250
+ "children": []
251
+ },
252
+ {
253
+ "title": null,
254
+ "contents": [
255
+ {
256
+ "type": "image",
257
+ "key": "/topic-extraction/img_26.jpg"
258
+ }
259
+ ],
260
+ "children": []
261
+ },
262
+ {
263
+ "title": null,
264
+ "contents": [
265
+ {
266
+ "type": "image",
267
+ "key": "/topic-extraction/img_27.jpg"
268
+ }
269
+ ],
270
+ "children": []
271
+ },
272
+ {
273
+ "title": null,
274
+ "contents": [
275
+ {
276
+ "type": "image",
277
+ "key": "/topic-extraction/img_28.jpg"
278
+ }
279
+ ],
280
+ "children": []
281
+ }
282
+ ]
wje/final_output.json ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "subtopics": {
3
+ "Paper 1 and Paper 2: Pure Mathematics": [
4
+ 11,
5
+ 29
6
+ ],
7
+ "Paper 3: Statistics and Mechanics": [
8
+ 30,
9
+ 40
10
+ ]
11
+ },
12
+ "local_images": {
13
+ "e7e5c8f3c0a6316c2b50698c45ebe05b49bfd8bbe47a07b7b1929dd3cfd3e609.jpg": "img_1.jpg",
14
+ "b243ef738ec2465b1cc00f4dd8dd0e5f5e10a91debf7762903ac6c023dd238c4.jpg": "img_2.jpg",
15
+ "5e22a8a8c5bc23ee4d16bda9cce4a6ab4bb53854074fd4d691531d5adb9f3ebe.jpg": "img_3.jpg",
16
+ "0e70645e72eadab75c88846b7947fc1216cf31d325febf02fbdf4898b430465d.jpg": "img_4.jpg",
17
+ "52484f429af5d74ef75e96bf132b15fdc4acd2ed46accb981d670592dcc57ff2.jpg": "img_5.jpg",
18
+ "5a153708e7a128d8f6477cb294d2f902d3a9bc57af709c81ccc3937b96580137.jpg": "img_6.jpg",
19
+ "fd3a52607bef204e6998e09db82d195de76d929399c2cb1a63e26f87054eec6f.jpg": "img_7.jpg",
20
+ "93885318f77c148b9fb1cd162cb9938d6f6cd795d000d5b997f2297198462fcf.jpg": "img_8.jpg",
21
+ "133a659582f49fb71dc5fcae918278e6659a257026e35741ba8e6b94fcdb9de6.jpg": "img_9.jpg",
22
+ "860d10a56a7e892c674f74fd030592339e629fb80d6e6dbfc343f95ec65a7c16.jpg": "img_10.jpg",
23
+ "ae5ee4479ae736ff433ca9b2a1c3f753bbc8cc11a384e27cb710b426757c31e9.jpg": "img_11.jpg",
24
+ "39ac9ccc8cd681e552fb1ae08341b4a2dcb33ea8fe6c787daf99fb993d29e57f.jpg": "img_12.jpg",
25
+ "6d67beb5c0bf2168a87ad6b7c179ff9c7de8bbd7e720f77f7bf206080cceb589.jpg": "img_13.jpg",
26
+ "b89d31200bc06fda181bd2538b5f3274de3e52b0adc7dd023ca676e168e6d487.jpg": "img_14.jpg",
27
+ "78907967ba7a56221a0987e6e696e361c82fcf057f41659e4aa77943a62b6763.jpg": "img_15.jpg",
28
+ "bd3eb31469dd7b72e9773564915dc768e2e152878d887dcab34e83875e0625bb.jpg": "img_16.jpg",
29
+ "f1f1acb21df3d785fa3120fbae5fc74f7064769d9b38524bb991cfaa110177f6.jpg": "img_17.jpg",
30
+ "b8b803d008ec9053c40f4a9c2c265a8a0b15742059331dc7997336c94ab74dc4.jpg": "img_18.jpg",
31
+ "9cbb4e3b89d75d1d5da2fe8c6ccc4c1d3f612779abaccf3322f8b78b2db8a161.jpg": "img_19.jpg",
32
+ "c6c4dfd8d7d1b83ef05d0ad30d4d09e75fe1d1152099b976eef7aededb872873.jpg": "img_20.jpg",
33
+ "7eaeb5261341b3dbe0554989b2681f87c4b7a418e21445f3e88aa873e16db0df.jpg": "img_21.jpg",
34
+ "22cbebb54b25ccf620ab043fc977fcc709fd5692d1e74b02267b8f689284225d.jpg": "img_22.jpg",
35
+ "7a3f07a668cfc19e26c35fb1421908638d5a233723942301eda2764a1e81374d.jpg": "img_23.jpg",
36
+ "42b9e068a3fddcc2adaa6736e0ccee448c0302349547c8eaed8a07c870d29b17.jpg": "img_24.jpg",
37
+ "2efcd74e6c9447686d3e08d2dca6998ffd44f5cf0323d7d93b4213a2337b32ab.jpg": "img_25.jpg",
38
+ "6ba16781c7909a8a47a6a51e520e739320c22791147ad6bbd482473cf5c96717.jpg": "img_26.jpg",
39
+ "3d3cdfbca59671749e9d93714510a36441a10769f6b43720f9f3e733d893ea3a.jpg": "img_27.jpg",
40
+ "35394d307566e17440ab0322a3c915a4537db1db85628b38f2fe7827d19d719d.jpg": "img_28.jpg"
41
+ },
42
+ "tables_extracted": {
43
+ "img_1.jpg": [
44
+ "img_1.jpg_rows/row_0/col_0.png",
45
+ "img_1.jpg_rows/row_0/col_1.png",
46
+ "img_1.jpg_rows/row_1/col_0.png",
47
+ "img_1.jpg_rows/row_1/col_1.png"
48
+ ],
49
+ "img_2.jpg": [
50
+ "img_2.jpg_rows/row_0/col_0.png",
51
+ "img_2.jpg_rows/row_0/col_1.png",
52
+ "img_2.jpg_rows/row_1/col_0.png",
53
+ "img_2.jpg_rows/row_2/col_0.png",
54
+ "img_2.jpg_rows/row_3/col_0.png"
55
+ ],
56
+ "img_3.jpg": [
57
+ "img_3.jpg_rows/row_0/col_0.png",
58
+ "img_3.jpg_rows/row_0/col_1.png",
59
+ "img_3.jpg_rows/row_1/col_0.png"
60
+ ],
61
+ "img_4.jpg": [
62
+ "img_4.jpg_rows/row_0/col_0.png",
63
+ "img_4.jpg_rows/row_0/col_1.png",
64
+ "img_4.jpg_rows/row_1/col_0.png",
65
+ "img_4.jpg_rows/row_1/col_1.png"
66
+ ],
67
+ "img_5.jpg": [
68
+ "img_5.jpg_rows/row_0/col_0.png",
69
+ "img_5.jpg_rows/row_0/col_1.png",
70
+ "img_5.jpg_rows/row_1/col_0.png",
71
+ "img_5.jpg_rows/row_1/col_1.png",
72
+ "img_5.jpg_rows/row_2/col_0.png"
73
+ ],
74
+ "img_6.jpg": [
75
+ "img_6.jpg_rows/row_0/col_0.png",
76
+ "img_6.jpg_rows/row_0/col_1.png",
77
+ "img_6.jpg_rows/row_1/col_0.png",
78
+ "img_6.jpg_rows/row_1/col_1.png"
79
+ ],
80
+ "img_7.jpg": [
81
+ "img_7.jpg_rows/row_0/col_0.png",
82
+ "img_7.jpg_rows/row_0/col_1.png",
83
+ "img_7.jpg_rows/row_1/col_0.png",
84
+ "img_7.jpg_rows/row_2/col_0.png",
85
+ "img_7.jpg_rows/row_2/col_1.png"
86
+ ],
87
+ "img_8.jpg": [
88
+ "img_8.jpg_rows/row_0/col_0.png",
89
+ "img_8.jpg_rows/row_0/col_1.png",
90
+ "img_8.jpg_rows/row_0/col_2.png",
91
+ "img_8.jpg_rows/row_1/col_0.png",
92
+ "img_8.jpg_rows/row_1/col_1.png",
93
+ "img_8.jpg_rows/row_1/col_2.png",
94
+ "img_8.jpg_rows/row_2/col_0.png",
95
+ "img_8.jpg_rows/row_2/col_1.png",
96
+ "img_8.jpg_rows/row_3/col_0.png",
97
+ "img_8.jpg_rows/row_3/col_1.png",
98
+ "img_8.jpg_rows/row_4/col_0.png",
99
+ "img_8.jpg_rows/row_4/col_1.png",
100
+ "img_8.jpg_rows/row_5/col_0.png",
101
+ "img_8.jpg_rows/row_5/col_1.png"
102
+ ],
103
+ "img_9.jpg": [
104
+ "img_9.jpg_rows/row_0/col_0.png",
105
+ "img_9.jpg_rows/row_0/col_1.png",
106
+ "img_9.jpg_rows/row_0/col_2.png",
107
+ "img_9.jpg_rows/row_1/col_0.png",
108
+ "img_9.jpg_rows/row_1/col_1.png",
109
+ "img_9.jpg_rows/row_2/col_0.png",
110
+ "img_9.jpg_rows/row_2/col_1.png",
111
+ "img_9.jpg_rows/row_3/col_0.png",
112
+ "img_9.jpg_rows/row_3/col_1.png"
113
+ ],
114
+ "img_10.jpg": [
115
+ "img_10.jpg_rows/row_0/col_0.png",
116
+ "img_10.jpg_rows/row_0/col_1.png",
117
+ "img_10.jpg_rows/row_1/col_0.png",
118
+ "img_10.jpg_rows/row_2/col_0.png",
119
+ "img_10.jpg_rows/row_3/col_0.png"
120
+ ],
121
+ "img_11.jpg": [
122
+ "img_11.jpg_rows/row_0/col_0.png",
123
+ "img_11.jpg_rows/row_1/col_0.png",
124
+ "img_11.jpg_rows/row_2/col_0.png",
125
+ "img_11.jpg_rows/row_3/col_0.png",
126
+ "img_11.jpg_rows/row_4/col_0.png",
127
+ "img_11.jpg_rows/row_5/col_0.png"
128
+ ],
129
+ "img_12.jpg": [
130
+ "img_12.jpg_rows/row_0/col_0.png",
131
+ "img_12.jpg_rows/row_0/col_1.png",
132
+ "img_12.jpg_rows/row_1/col_0.png",
133
+ "img_12.jpg_rows/row_1/col_1.png",
134
+ "img_12.jpg_rows/row_2/col_0.png",
135
+ "img_12.jpg_rows/row_2/col_1.png"
136
+ ],
137
+ "img_13.jpg": [
138
+ "img_13.jpg_rows/row_0/col_0.png",
139
+ "img_13.jpg_rows/row_0/col_1.png",
140
+ "img_13.jpg_rows/row_1/col_0.png",
141
+ "img_13.jpg_rows/row_1/col_1.png",
142
+ "img_13.jpg_rows/row_2/col_0.png",
143
+ "img_13.jpg_rows/row_3/col_0.png"
144
+ ],
145
+ "img_14.jpg": [
146
+ "img_14.jpg_rows/row_0/col_0.png",
147
+ "img_14.jpg_rows/row_0/col_1.png",
148
+ "img_14.jpg_rows/row_1/col_0.png",
149
+ "img_14.jpg_rows/row_1/col_1.png",
150
+ "img_14.jpg_rows/row_2/col_0.png",
151
+ "img_14.jpg_rows/row_3/col_0.png",
152
+ "img_14.jpg_rows/row_4/col_0.png",
153
+ "img_14.jpg_rows/row_4/col_1.png",
154
+ "img_14.jpg_rows/row_5/col_0.png"
155
+ ],
156
+ "img_15.jpg": [
157
+ "img_15.jpg_rows/row_0/col_0.png",
158
+ "img_15.jpg_rows/row_0/col_1.png",
159
+ "img_15.jpg_rows/row_1/col_0.png",
160
+ "img_15.jpg_rows/row_1/col_1.png",
161
+ "img_15.jpg_rows/row_2/col_0.png",
162
+ "img_15.jpg_rows/row_3/col_0.png",
163
+ "img_15.jpg_rows/row_4/col_0.png"
164
+ ],
165
+ "img_16.jpg": [
166
+ "img_16.jpg_rows/row_0/col_0.png",
167
+ "img_16.jpg_rows/row_0/col_1.png",
168
+ "img_16.jpg_rows/row_1/col_0.png",
169
+ "img_16.jpg_rows/row_1/col_1.png",
170
+ "img_16.jpg_rows/row_2/col_0.png",
171
+ "img_16.jpg_rows/row_3/col_0.png",
172
+ "img_16.jpg_rows/row_3/col_1.png",
173
+ "img_16.jpg_rows/row_4/col_0.png",
174
+ "img_16.jpg_rows/row_5/col_0.png"
175
+ ],
176
+ "img_17.jpg": [
177
+ "img_17.jpg_rows/row_0/col_0.png",
178
+ "img_17.jpg_rows/row_0/col_1.png",
179
+ "img_17.jpg_rows/row_1/col_0.png",
180
+ "img_17.jpg_rows/row_2/col_0.png",
181
+ "img_17.jpg_rows/row_2/col_1.png",
182
+ "img_17.jpg_rows/row_3/col_0.png",
183
+ "img_17.jpg_rows/row_4/col_0.png",
184
+ "img_17.jpg_rows/row_5/col_0.png"
185
+ ],
186
+ "img_18.jpg": [
187
+ "img_18.jpg_rows/row_0/col_0.png",
188
+ "img_18.jpg_rows/row_0/col_1.png",
189
+ "img_18.jpg_rows/row_1/col_0.png",
190
+ "img_18.jpg_rows/row_1/col_1.png"
191
+ ],
192
+ "img_19.jpg": [
193
+ "img_19.jpg_rows/row_0/col_0.png",
194
+ "img_19.jpg_rows/row_0/col_1.png",
195
+ "img_19.jpg_rows/row_1/col_0.png",
196
+ "img_19.jpg_rows/row_1/col_1.png",
197
+ "img_19.jpg_rows/row_2/col_0.png",
198
+ "img_19.jpg_rows/row_2/col_1.png"
199
+ ],
200
+ "img_20.jpg": [
201
+ "img_20.jpg_rows/row_0/col_0.png",
202
+ "img_20.jpg_rows/row_0/col_1.png",
203
+ "img_20.jpg_rows/row_1/col_0.png",
204
+ "img_20.jpg_rows/row_1/col_1.png"
205
+ ],
206
+ "img_21.jpg": [
207
+ "img_21.jpg_rows/row_0/col_0.png",
208
+ "img_21.jpg_rows/row_0/col_1.png",
209
+ "img_21.jpg_rows/row_1/col_0.png",
210
+ "img_21.jpg_rows/row_1/col_1.png"
211
+ ],
212
+ "img_22.jpg": [
213
+ "img_22.jpg_rows/row_0/col_0.png",
214
+ "img_22.jpg_rows/row_0/col_1.png",
215
+ "img_22.jpg_rows/row_1/col_0.png",
216
+ "img_22.jpg_rows/row_1/col_1.png",
217
+ "img_22.jpg_rows/row_2/col_0.png",
218
+ "img_22.jpg_rows/row_2/col_1.png",
219
+ "img_22.jpg_rows/row_3/col_0.png"
220
+ ],
221
+ "img_23.jpg": [
222
+ "img_23.jpg_rows/row_0/col_0.png",
223
+ "img_23.jpg_rows/row_0/col_1.png",
224
+ "img_23.jpg_rows/row_1/col_0.png",
225
+ "img_23.jpg_rows/row_1/col_1.png"
226
+ ],
227
+ "img_24.jpg": [
228
+ "img_24.jpg_rows/row_0/col_0.png",
229
+ "img_24.jpg_rows/row_0/col_1.png",
230
+ "img_24.jpg_rows/row_1/col_0.png",
231
+ "img_24.jpg_rows/row_1/col_1.png",
232
+ "img_24.jpg_rows/row_2/col_0.png"
233
+ ],
234
+ "img_25.jpg": [
235
+ "img_25.jpg_rows/row_0/col_0.png",
236
+ "img_25.jpg_rows/row_1/col_0.png",
237
+ "img_25.jpg_rows/row_2/col_0.png",
238
+ "img_25.jpg_rows/row_3/col_0.png",
239
+ "img_25.jpg_rows/row_4/col_0.png",
240
+ "img_25.jpg_rows/row_5/col_0.png"
241
+ ],
242
+ "img_26.jpg": [
243
+ "img_26.jpg_rows/row_0/col_0.png",
244
+ "img_26.jpg_rows/row_0/col_1.png",
245
+ "img_26.jpg_rows/row_1/col_0.png",
246
+ "img_26.jpg_rows/row_2/col_0.png"
247
+ ],
248
+ "img_27.jpg": [
249
+ "img_27.jpg_rows/row_0/col_0.png",
250
+ "img_27.jpg_rows/row_0/col_1.png",
251
+ "img_27.jpg_rows/row_1/col_0.png",
252
+ "img_27.jpg_rows/row_1/col_1.png",
253
+ "img_27.jpg_rows/row_2/col_0.png",
254
+ "img_27.jpg_rows/row_3/col_0.png",
255
+ "img_27.jpg_rows/row_4/col_0.png",
256
+ "img_27.jpg_rows/row_4/col_1.png"
257
+ ],
258
+ "img_28.jpg": [
259
+ "img_28.jpg_rows/row_0/col_0.png",
260
+ "img_28.jpg_rows/row_1/col_0.png",
261
+ "img_28.jpg_rows/row_2/col_0.png",
262
+ "img_28.jpg_rows/row_3/col_0.png"
263
+ ]
264
+ }
265
+ }
wje/final_output_local.json ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "subtopics": {
3
+ "Paper 1 and Paper 2: Pure Mathematics": [
4
+ 11,
5
+ 29
6
+ ],
7
+ "Paper 3: Statistics and Mechanics": [
8
+ 30,
9
+ 40
10
+ ]
11
+ },
12
+ "local_images": {
13
+ "e7e5c8f3c0a6316c2b50698c45ebe05b49bfd8bbe47a07b7b1929dd3cfd3e609.jpg": "img_1.jpg",
14
+ "b243ef738ec2465b1cc00f4dd8dd0e5f5e10a91debf7762903ac6c023dd238c4.jpg": "img_2.jpg",
15
+ "5e22a8a8c5bc23ee4d16bda9cce4a6ab4bb53854074fd4d691531d5adb9f3ebe.jpg": "img_3.jpg",
16
+ "0e70645e72eadab75c88846b7947fc1216cf31d325febf02fbdf4898b430465d.jpg": "img_4.jpg",
17
+ "52484f429af5d74ef75e96bf132b15fdc4acd2ed46accb981d670592dcc57ff2.jpg": "img_5.jpg",
18
+ "5a153708e7a128d8f6477cb294d2f902d3a9bc57af709c81ccc3937b96580137.jpg": "img_6.jpg",
19
+ "fd3a52607bef204e6998e09db82d195de76d929399c2cb1a63e26f87054eec6f.jpg": "img_7.jpg",
20
+ "93885318f77c148b9fb1cd162cb9938d6f6cd795d000d5b997f2297198462fcf.jpg": "img_8.jpg",
21
+ "133a659582f49fb71dc5fcae918278e6659a257026e35741ba8e6b94fcdb9de6.jpg": "img_9.jpg",
22
+ "860d10a56a7e892c674f74fd030592339e629fb80d6e6dbfc343f95ec65a7c16.jpg": "img_10.jpg",
23
+ "ae5ee4479ae736ff433ca9b2a1c3f753bbc8cc11a384e27cb710b426757c31e9.jpg": "img_11.jpg",
24
+ "39ac9ccc8cd681e552fb1ae08341b4a2dcb33ea8fe6c787daf99fb993d29e57f.jpg": "img_12.jpg",
25
+ "6d67beb5c0bf2168a87ad6b7c179ff9c7de8bbd7e720f77f7bf206080cceb589.jpg": "img_13.jpg",
26
+ "b89d31200bc06fda181bd2538b5f3274de3e52b0adc7dd023ca676e168e6d487.jpg": "img_14.jpg",
27
+ "78907967ba7a56221a0987e6e696e361c82fcf057f41659e4aa77943a62b6763.jpg": "img_15.jpg",
28
+ "bd3eb31469dd7b72e9773564915dc768e2e152878d887dcab34e83875e0625bb.jpg": "img_16.jpg",
29
+ "f1f1acb21df3d785fa3120fbae5fc74f7064769d9b38524bb991cfaa110177f6.jpg": "img_17.jpg",
30
+ "b8b803d008ec9053c40f4a9c2c265a8a0b15742059331dc7997336c94ab74dc4.jpg": "img_18.jpg",
31
+ "9cbb4e3b89d75d1d5da2fe8c6ccc4c1d3f612779abaccf3322f8b78b2db8a161.jpg": "img_19.jpg",
32
+ "c6c4dfd8d7d1b83ef05d0ad30d4d09e75fe1d1152099b976eef7aededb872873.jpg": "img_20.jpg",
33
+ "7eaeb5261341b3dbe0554989b2681f87c4b7a418e21445f3e88aa873e16db0df.jpg": "img_21.jpg",
34
+ "22cbebb54b25ccf620ab043fc977fcc709fd5692d1e74b02267b8f689284225d.jpg": "img_22.jpg",
35
+ "7a3f07a668cfc19e26c35fb1421908638d5a233723942301eda2764a1e81374d.jpg": "img_23.jpg",
36
+ "42b9e068a3fddcc2adaa6736e0ccee448c0302349547c8eaed8a07c870d29b17.jpg": "img_24.jpg",
37
+ "2efcd74e6c9447686d3e08d2dca6998ffd44f5cf0323d7d93b4213a2337b32ab.jpg": "img_25.jpg",
38
+ "6ba16781c7909a8a47a6a51e520e739320c22791147ad6bbd482473cf5c96717.jpg": "img_26.jpg",
39
+ "3d3cdfbca59671749e9d93714510a36441a10769f6b43720f9f3e733d893ea3a.jpg": "img_27.jpg",
40
+ "35394d307566e17440ab0322a3c915a4537db1db85628b38f2fe7827d19d719d.jpg": "img_28.jpg"
41
+ },
42
+ "tables_extracted": {
43
+ "img_1.jpg": [
44
+ "img_1.jpg_rows/row_0/col_0.png",
45
+ "img_1.jpg_rows/row_0/col_1.png",
46
+ "img_1.jpg_rows/row_1/col_0.png",
47
+ "img_1.jpg_rows/row_1/col_1.png"
48
+ ],
49
+ "img_2.jpg": [
50
+ "img_2.jpg_rows/row_0/col_0.png",
51
+ "img_2.jpg_rows/row_0/col_1.png",
52
+ "img_2.jpg_rows/row_1/col_0.png",
53
+ "img_2.jpg_rows/row_2/col_0.png",
54
+ "img_2.jpg_rows/row_3/col_0.png"
55
+ ],
56
+ "img_3.jpg": [
57
+ "img_3.jpg_rows/row_0/col_0.png",
58
+ "img_3.jpg_rows/row_0/col_1.png",
59
+ "img_3.jpg_rows/row_1/col_0.png"
60
+ ],
61
+ "img_4.jpg": [
62
+ "img_4.jpg_rows/row_0/col_0.png",
63
+ "img_4.jpg_rows/row_0/col_1.png",
64
+ "img_4.jpg_rows/row_1/col_0.png",
65
+ "img_4.jpg_rows/row_1/col_1.png"
66
+ ],
67
+ "img_5.jpg": [
68
+ "img_5.jpg_rows/row_0/col_0.png",
69
+ "img_5.jpg_rows/row_0/col_1.png",
70
+ "img_5.jpg_rows/row_1/col_0.png",
71
+ "img_5.jpg_rows/row_1/col_1.png",
72
+ "img_5.jpg_rows/row_2/col_0.png"
73
+ ],
74
+ "img_6.jpg": [
75
+ "img_6.jpg_rows/row_0/col_0.png",
76
+ "img_6.jpg_rows/row_0/col_1.png",
77
+ "img_6.jpg_rows/row_1/col_0.png",
78
+ "img_6.jpg_rows/row_1/col_1.png"
79
+ ],
80
+ "img_7.jpg": [
81
+ "img_7.jpg_rows/row_0/col_0.png",
82
+ "img_7.jpg_rows/row_0/col_1.png",
83
+ "img_7.jpg_rows/row_1/col_0.png",
84
+ "img_7.jpg_rows/row_2/col_0.png",
85
+ "img_7.jpg_rows/row_2/col_1.png"
86
+ ],
87
+ "img_8.jpg": [
88
+ "img_8.jpg_rows/row_0/col_0.png",
89
+ "img_8.jpg_rows/row_0/col_1.png",
90
+ "img_8.jpg_rows/row_0/col_2.png",
91
+ "img_8.jpg_rows/row_1/col_0.png",
92
+ "img_8.jpg_rows/row_1/col_1.png",
93
+ "img_8.jpg_rows/row_1/col_2.png",
94
+ "img_8.jpg_rows/row_2/col_0.png",
95
+ "img_8.jpg_rows/row_2/col_1.png",
96
+ "img_8.jpg_rows/row_3/col_0.png",
97
+ "img_8.jpg_rows/row_3/col_1.png",
98
+ "img_8.jpg_rows/row_4/col_0.png",
99
+ "img_8.jpg_rows/row_4/col_1.png",
100
+ "img_8.jpg_rows/row_5/col_0.png",
101
+ "img_8.jpg_rows/row_5/col_1.png"
102
+ ],
103
+ "img_9.jpg": [
104
+ "img_9.jpg_rows/row_0/col_0.png",
105
+ "img_9.jpg_rows/row_0/col_1.png",
106
+ "img_9.jpg_rows/row_0/col_2.png",
107
+ "img_9.jpg_rows/row_1/col_0.png",
108
+ "img_9.jpg_rows/row_1/col_1.png",
109
+ "img_9.jpg_rows/row_2/col_0.png",
110
+ "img_9.jpg_rows/row_2/col_1.png",
111
+ "img_9.jpg_rows/row_3/col_0.png",
112
+ "img_9.jpg_rows/row_3/col_1.png"
113
+ ],
114
+ "img_10.jpg": [
115
+ "img_10.jpg_rows/row_0/col_0.png",
116
+ "img_10.jpg_rows/row_0/col_1.png",
117
+ "img_10.jpg_rows/row_1/col_0.png",
118
+ "img_10.jpg_rows/row_2/col_0.png",
119
+ "img_10.jpg_rows/row_3/col_0.png"
120
+ ],
121
+ "img_11.jpg": [
122
+ "img_11.jpg_rows/row_0/col_0.png",
123
+ "img_11.jpg_rows/row_1/col_0.png",
124
+ "img_11.jpg_rows/row_2/col_0.png",
125
+ "img_11.jpg_rows/row_3/col_0.png",
126
+ "img_11.jpg_rows/row_4/col_0.png",
127
+ "img_11.jpg_rows/row_5/col_0.png"
128
+ ],
129
+ "img_12.jpg": [
130
+ "img_12.jpg_rows/row_0/col_0.png",
131
+ "img_12.jpg_rows/row_0/col_1.png",
132
+ "img_12.jpg_rows/row_1/col_0.png",
133
+ "img_12.jpg_rows/row_1/col_1.png",
134
+ "img_12.jpg_rows/row_2/col_0.png",
135
+ "img_12.jpg_rows/row_2/col_1.png"
136
+ ],
137
+ "img_13.jpg": [
138
+ "img_13.jpg_rows/row_0/col_0.png",
139
+ "img_13.jpg_rows/row_0/col_1.png",
140
+ "img_13.jpg_rows/row_1/col_0.png",
141
+ "img_13.jpg_rows/row_1/col_1.png",
142
+ "img_13.jpg_rows/row_2/col_0.png",
143
+ "img_13.jpg_rows/row_3/col_0.png"
144
+ ],
145
+ "img_14.jpg": [
146
+ "img_14.jpg_rows/row_0/col_0.png",
147
+ "img_14.jpg_rows/row_0/col_1.png",
148
+ "img_14.jpg_rows/row_1/col_0.png",
149
+ "img_14.jpg_rows/row_1/col_1.png",
150
+ "img_14.jpg_rows/row_2/col_0.png",
151
+ "img_14.jpg_rows/row_3/col_0.png",
152
+ "img_14.jpg_rows/row_4/col_0.png",
153
+ "img_14.jpg_rows/row_4/col_1.png",
154
+ "img_14.jpg_rows/row_5/col_0.png"
155
+ ],
156
+ "img_15.jpg": [
157
+ "img_15.jpg_rows/row_0/col_0.png",
158
+ "img_15.jpg_rows/row_0/col_1.png",
159
+ "img_15.jpg_rows/row_1/col_0.png",
160
+ "img_15.jpg_rows/row_1/col_1.png",
161
+ "img_15.jpg_rows/row_2/col_0.png",
162
+ "img_15.jpg_rows/row_3/col_0.png",
163
+ "img_15.jpg_rows/row_4/col_0.png"
164
+ ],
165
+ "img_16.jpg": [
166
+ "img_16.jpg_rows/row_0/col_0.png",
167
+ "img_16.jpg_rows/row_0/col_1.png",
168
+ "img_16.jpg_rows/row_1/col_0.png",
169
+ "img_16.jpg_rows/row_1/col_1.png",
170
+ "img_16.jpg_rows/row_2/col_0.png",
171
+ "img_16.jpg_rows/row_3/col_0.png",
172
+ "img_16.jpg_rows/row_3/col_1.png",
173
+ "img_16.jpg_rows/row_4/col_0.png",
174
+ "img_16.jpg_rows/row_5/col_0.png"
175
+ ],
176
+ "img_17.jpg": [
177
+ "img_17.jpg_rows/row_0/col_0.png",
178
+ "img_17.jpg_rows/row_0/col_1.png",
179
+ "img_17.jpg_rows/row_1/col_0.png",
180
+ "img_17.jpg_rows/row_2/col_0.png",
181
+ "img_17.jpg_rows/row_2/col_1.png",
182
+ "img_17.jpg_rows/row_3/col_0.png",
183
+ "img_17.jpg_rows/row_4/col_0.png",
184
+ "img_17.jpg_rows/row_5/col_0.png"
185
+ ],
186
+ "img_18.jpg": [
187
+ "img_18.jpg_rows/row_0/col_0.png",
188
+ "img_18.jpg_rows/row_0/col_1.png",
189
+ "img_18.jpg_rows/row_1/col_0.png",
190
+ "img_18.jpg_rows/row_1/col_1.png"
191
+ ],
192
+ "img_19.jpg": [
193
+ "img_19.jpg_rows/row_0/col_0.png",
194
+ "img_19.jpg_rows/row_0/col_1.png",
195
+ "img_19.jpg_rows/row_1/col_0.png",
196
+ "img_19.jpg_rows/row_1/col_1.png",
197
+ "img_19.jpg_rows/row_2/col_0.png",
198
+ "img_19.jpg_rows/row_2/col_1.png"
199
+ ],
200
+ "img_20.jpg": [
201
+ "img_20.jpg_rows/row_0/col_0.png",
202
+ "img_20.jpg_rows/row_0/col_1.png",
203
+ "img_20.jpg_rows/row_1/col_0.png",
204
+ "img_20.jpg_rows/row_1/col_1.png"
205
+ ],
206
+ "img_21.jpg": [
207
+ "img_21.jpg_rows/row_0/col_0.png",
208
+ "img_21.jpg_rows/row_0/col_1.png",
209
+ "img_21.jpg_rows/row_1/col_0.png",
210
+ "img_21.jpg_rows/row_1/col_1.png"
211
+ ],
212
+ "img_22.jpg": [
213
+ "img_22.jpg_rows/row_0/col_0.png",
214
+ "img_22.jpg_rows/row_0/col_1.png",
215
+ "img_22.jpg_rows/row_1/col_0.png",
216
+ "img_22.jpg_rows/row_1/col_1.png",
217
+ "img_22.jpg_rows/row_2/col_0.png",
218
+ "img_22.jpg_rows/row_2/col_1.png",
219
+ "img_22.jpg_rows/row_3/col_0.png"
220
+ ],
221
+ "img_23.jpg": [
222
+ "img_23.jpg_rows/row_0/col_0.png",
223
+ "img_23.jpg_rows/row_0/col_1.png",
224
+ "img_23.jpg_rows/row_1/col_0.png",
225
+ "img_23.jpg_rows/row_1/col_1.png"
226
+ ],
227
+ "img_24.jpg": [
228
+ "img_24.jpg_rows/row_0/col_0.png",
229
+ "img_24.jpg_rows/row_0/col_1.png",
230
+ "img_24.jpg_rows/row_1/col_0.png",
231
+ "img_24.jpg_rows/row_1/col_1.png",
232
+ "img_24.jpg_rows/row_2/col_0.png"
233
+ ],
234
+ "img_25.jpg": [
235
+ "img_25.jpg_rows/row_0/col_0.png",
236
+ "img_25.jpg_rows/row_1/col_0.png",
237
+ "img_25.jpg_rows/row_2/col_0.png",
238
+ "img_25.jpg_rows/row_3/col_0.png",
239
+ "img_25.jpg_rows/row_4/col_0.png",
240
+ "img_25.jpg_rows/row_5/col_0.png"
241
+ ],
242
+ "img_26.jpg": [
243
+ "img_26.jpg_rows/row_0/col_0.png",
244
+ "img_26.jpg_rows/row_0/col_1.png",
245
+ "img_26.jpg_rows/row_1/col_0.png",
246
+ "img_26.jpg_rows/row_2/col_0.png"
247
+ ],
248
+ "img_27.jpg": [
249
+ "img_27.jpg_rows/row_0/col_0.png",
250
+ "img_27.jpg_rows/row_0/col_1.png",
251
+ "img_27.jpg_rows/row_1/col_0.png",
252
+ "img_27.jpg_rows/row_1/col_1.png",
253
+ "img_27.jpg_rows/row_2/col_0.png",
254
+ "img_27.jpg_rows/row_3/col_0.png",
255
+ "img_27.jpg_rows/row_4/col_0.png",
256
+ "img_27.jpg_rows/row_4/col_1.png"
257
+ ],
258
+ "img_28.jpg": [
259
+ "img_28.jpg_rows/row_0/col_0.png",
260
+ "img_28.jpg_rows/row_1/col_0.png",
261
+ "img_28.jpg_rows/row_2/col_0.png",
262
+ "img_28.jpg_rows/row_3/col_0.png"
263
+ ]
264
+ }
265
+ }
wje/img_1.jpg_rows/row_0/col_0.png ADDED
wje/img_1.jpg_rows/row_0/col_1.png ADDED
wje/img_1.jpg_rows/row_1/col_0.png ADDED
wje/img_1.jpg_rows/row_1/col_1.png ADDED
wje/img_10.jpg_rows/row_0/col_0.png ADDED
wje/img_10.jpg_rows/row_0/col_1.png ADDED
wje/img_10.jpg_rows/row_1/col_0.png ADDED
wje/img_10.jpg_rows/row_2/col_0.png ADDED
wje/img_10.jpg_rows/row_3/col_0.png ADDED
wje/img_11.jpg_rows/row_0/col_0.png ADDED
wje/img_11.jpg_rows/row_1/col_0.png ADDED
wje/img_11.jpg_rows/row_2/col_0.png ADDED
wje/img_11.jpg_rows/row_3/col_0.png ADDED
wje/img_11.jpg_rows/row_4/col_0.png ADDED
wje/img_11.jpg_rows/row_5/col_0.png ADDED
wje/img_12.jpg_rows/row_0/col_0.png ADDED
wje/img_12.jpg_rows/row_0/col_1.png ADDED
wje/img_12.jpg_rows/row_1/col_0.png ADDED
wje/img_12.jpg_rows/row_1/col_1.png ADDED
wje/img_12.jpg_rows/row_2/col_0.png ADDED
wje/img_12.jpg_rows/row_2/col_1.png ADDED
wje/img_13.jpg_rows/row_0/col_0.png ADDED
wje/img_13.jpg_rows/row_0/col_1.png ADDED
wje/img_13.jpg_rows/row_1/col_0.png ADDED
wje/img_13.jpg_rows/row_1/col_1.png ADDED
wje/img_13.jpg_rows/row_2/col_0.png ADDED
wje/img_13.jpg_rows/row_3/col_0.png ADDED
wje/img_14.jpg_rows/row_0/col_0.png ADDED
wje/img_14.jpg_rows/row_0/col_1.png ADDED
wje/img_14.jpg_rows/row_1/col_0.png ADDED
wje/img_14.jpg_rows/row_1/col_1.png ADDED
wje/img_14.jpg_rows/row_2/col_0.png ADDED
wje/img_14.jpg_rows/row_3/col_0.png ADDED
wje/img_14.jpg_rows/row_4/col_0.png ADDED
wje/img_14.jpg_rows/row_4/col_1.png ADDED
wje/img_14.jpg_rows/row_5/col_0.png ADDED
wje/img_15.jpg_rows/row_0/col_0.png ADDED
wje/img_15.jpg_rows/row_0/col_1.png ADDED
wje/img_15.jpg_rows/row_1/col_0.png ADDED