json saving
Browse files- topic_extraction.log +641 -0
- topic_extraction.py +72 -79
topic_extraction.log
CHANGED
|
@@ -87,3 +87,644 @@
|
|
| 87 |
2025-02-28 15:36:37,389 [WARNING] __main__ - No suitable occurrence for '2.4 A2 Unit 4'. Using page 3.
|
| 88 |
2025-02-28 15:36:37,390 [INFO] __main__ - Processing pages (0-based): [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
|
| 89 |
2025-02-28 15:36:38,518 [INFO] __main__ - GPU memory cleaned up.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
2025-02-28 15:36:37,389 [WARNING] __main__ - No suitable occurrence for '2.4 A2 Unit 4'. Using page 3.
|
| 88 |
2025-02-28 15:36:37,390 [INFO] __main__ - Processing pages (0-based): [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
|
| 89 |
2025-02-28 15:36:38,518 [INFO] __main__ - GPU memory cleaned up.
|
| 90 |
+
2025-03-02 14:46:32,526 [INFO] __main__ - Processing PDF: /home/user/app/input_output/ocr-specification-economics.pdf
|
| 91 |
+
2025-03-02 14:46:33,659 [INFO] __main__ - Gemini returned subtopics: {'Content of A Level in Economics (H460)': [5, 5], 'Content of Component 1: Microeconomics (H460/01)': [6, 16], 'Content of Component 2: Macroeconomics (H460/02)': [17, 27], 'Content of Component 3: Themes in economics (H460/03)': [28, 28]}
|
| 92 |
+
2025-03-02 14:46:33,667 [INFO] __main__ - Loaded 9752567 bytes from local file '/home/user/app/input_output/ocr-specification-economics.pdf'
|
| 93 |
+
2025-03-02 14:46:33,882 [INFO] __main__ - Occurrences of subtopic 'Content of A Level in Economics (H460)': [2, 10]
|
| 94 |
+
2025-03-02 14:46:34,025 [INFO] __main__ - Occurrences of subtopic 'Content of Component 1: Microeconomics (H460/01)': [2, 11]
|
| 95 |
+
2025-03-02 14:46:34,172 [INFO] __main__ - Occurrences of subtopic 'Content of Component 2: Macroeconomics (H460/02)': [2, 22]
|
| 96 |
+
2025-03-02 14:46:34,318 [INFO] __main__ - Occurrences of subtopic 'Content of Component 3: Themes in economics (H460/03)': [2, 33]
|
| 97 |
+
2025-03-02 14:46:34,319 [INFO] __main__ - Processing pages (0-based): [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
|
| 98 |
+
2025-03-02 14:49:00,397 [INFO] __main__ - GPU memory cleaned up.
|
| 99 |
+
2025-03-02 14:49:46,666 [INFO] __main__ - Processing PDF: /home/user/app/input_output/ocr-specification-economics.pdf
|
| 100 |
+
2025-03-02 14:49:47,756 [INFO] __main__ - Gemini returned subtopics: {'Content of A Level in Economics (H460)': [5, 5], 'Content of Component 1: Microeconomics (H460/01)': [6, 16], 'Content of Component 2: Macroeconomics (H460/02)': [17, 27], 'Content of Component 3: Themes in economics (H460/03)': [28, 28]}
|
| 101 |
+
2025-03-02 14:49:47,762 [INFO] __main__ - Loaded 9752567 bytes from local file '/home/user/app/input_output/ocr-specification-economics.pdf'
|
| 102 |
+
2025-03-02 14:49:47,909 [INFO] __main__ - Occurrences of subtopic 'Content of A Level in Economics (H460)': [2, 10]
|
| 103 |
+
2025-03-02 14:49:48,062 [INFO] __main__ - Occurrences of subtopic 'Content of Component 1: Microeconomics (H460/01)': [2, 11]
|
| 104 |
+
2025-03-02 14:49:48,290 [INFO] __main__ - Occurrences of subtopic 'Content of Component 2: Macroeconomics (H460/02)': [2, 22]
|
| 105 |
+
2025-03-02 14:49:48,475 [INFO] __main__ - Occurrences of subtopic 'Content of Component 3: Themes in economics (H460/03)': [2, 33]
|
| 106 |
+
2025-03-02 14:49:48,475 [INFO] __main__ - Processing pages (0-based): [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
|
| 107 |
+
2025-03-02 14:50:20,455 [INFO] __main__ - doc_analyze complete. Extracting images.
|
| 108 |
+
2025-03-02 14:50:23,117 [INFO] __main__ - Uploaded to S3: topic_extraction/img_1.jpg
|
| 109 |
+
2025-03-02 14:50:23,774 [INFO] __main__ - Uploaded to S3: topic_extraction/img_2.jpg
|
| 110 |
+
2025-03-02 14:50:24,317 [INFO] __main__ - Uploaded to S3: topic_extraction/img_3.jpg
|
| 111 |
+
2025-03-02 14:50:24,869 [INFO] __main__ - Uploaded to S3: topic_extraction/img_4.jpg
|
| 112 |
+
2025-03-02 14:50:25,434 [INFO] __main__ - Uploaded to S3: topic_extraction/img_5.jpg
|
| 113 |
+
2025-03-02 14:50:26,016 [INFO] __main__ - Uploaded to S3: topic_extraction/img_6.jpg
|
| 114 |
+
2025-03-02 14:50:26,595 [INFO] __main__ - Uploaded to S3: topic_extraction/img_7.jpg
|
| 115 |
+
2025-03-02 14:50:27,181 [INFO] __main__ - Uploaded to S3: topic_extraction/img_8.jpg
|
| 116 |
+
2025-03-02 14:50:27,789 [INFO] __main__ - Uploaded to S3: topic_extraction/img_9.jpg
|
| 117 |
+
2025-03-02 14:50:28,428 [INFO] __main__ - Uploaded to S3: topic_extraction/img_10.jpg
|
| 118 |
+
2025-03-02 14:50:28,805 [INFO] __main__ - Uploaded to S3: topic_extraction/img_11.jpg
|
| 119 |
+
2025-03-02 14:50:29,410 [INFO] __main__ - Uploaded to S3: topic_extraction/img_12.jpg
|
| 120 |
+
2025-03-02 14:50:30,093 [INFO] __main__ - Uploaded to S3: topic_extraction/img_13.jpg
|
| 121 |
+
2025-03-02 14:50:30,713 [INFO] __main__ - Uploaded to S3: topic_extraction/img_14.jpg
|
| 122 |
+
2025-03-02 14:50:31,163 [INFO] __main__ - Uploaded to S3: topic_extraction/img_15.jpg
|
| 123 |
+
2025-03-02 14:50:31,781 [INFO] __main__ - Uploaded to S3: topic_extraction/img_16.jpg
|
| 124 |
+
2025-03-02 14:50:32,235 [INFO] __main__ - Uploaded to S3: topic_extraction/img_17.jpg
|
| 125 |
+
2025-03-02 14:50:32,803 [INFO] __main__ - Uploaded to S3: topic_extraction/img_18.jpg
|
| 126 |
+
2025-03-02 14:50:33,387 [INFO] __main__ - Uploaded to S3: topic_extraction/img_19.jpg
|
| 127 |
+
2025-03-02 14:50:34,032 [INFO] __main__ - Uploaded to S3: topic_extraction/img_20.jpg
|
| 128 |
+
2025-03-02 14:50:34,618 [INFO] __main__ - Uploaded to S3: topic_extraction/img_21.jpg
|
| 129 |
+
2025-03-02 14:50:35,143 [INFO] __main__ - Uploaded to S3: topic_extraction/img_22.jpg
|
| 130 |
+
2025-03-02 14:50:35,616 [INFO] __main__ - Classifying images to detect tables.
|
| 131 |
+
2025-03-02 14:51:05,394 [INFO] __main__ - Processing table image: topic_extraction/img_1.jpg, columns=three
|
| 132 |
+
2025-03-02 14:51:09,026 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_1_row0_col0.jpg
|
| 133 |
+
2025-03-02 14:51:09,310 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_1_row0_col1.jpg
|
| 134 |
+
2025-03-02 14:51:09,522 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_1_row1_col0.jpg
|
| 135 |
+
2025-03-02 14:51:09,737 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_1_row2_col0.jpg
|
| 136 |
+
2025-03-02 14:51:09,738 [INFO] __main__ - Processing table image: topic_extraction/img_2.jpg, columns=three
|
| 137 |
+
2025-03-02 14:51:13,984 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_2_row0_col0.jpg
|
| 138 |
+
2025-03-02 14:51:14,273 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_2_row0_col1.jpg
|
| 139 |
+
2025-03-02 14:51:14,486 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_2_row1_col0.jpg
|
| 140 |
+
2025-03-02 14:51:14,695 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_2_row2_col0.jpg
|
| 141 |
+
2025-03-02 14:51:14,888 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_2_row2_col1.jpg
|
| 142 |
+
2025-03-02 14:51:15,097 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_2_row3_col0.jpg
|
| 143 |
+
2025-03-02 14:51:15,097 [INFO] __main__ - Processing table image: topic_extraction/img_3.jpg, columns=three
|
| 144 |
+
2025-03-02 14:51:18,329 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_3_row0_col0.jpg
|
| 145 |
+
2025-03-02 14:51:18,633 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_3_row0_col1.jpg
|
| 146 |
+
2025-03-02 14:51:18,633 [INFO] __main__ - Processing table image: topic_extraction/img_4.jpg, columns=three
|
| 147 |
+
2025-03-02 14:51:22,434 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_4_row0_col0.jpg
|
| 148 |
+
2025-03-02 14:51:22,707 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_4_row0_col1.jpg
|
| 149 |
+
2025-03-02 14:51:22,708 [INFO] __main__ - Processing table image: topic_extraction/img_5.jpg, columns=three
|
| 150 |
+
2025-03-02 14:51:26,211 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_5_row0_col0.jpg
|
| 151 |
+
2025-03-02 14:51:26,504 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_5_row0_col1.jpg
|
| 152 |
+
2025-03-02 14:51:26,710 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_5_row1_col0.jpg
|
| 153 |
+
2025-03-02 14:51:26,710 [INFO] __main__ - Processing table image: topic_extraction/img_6.jpg, columns=three
|
| 154 |
+
2025-03-02 14:51:30,866 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_6_row0_col0.jpg
|
| 155 |
+
2025-03-02 14:51:31,155 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_6_row0_col1.jpg
|
| 156 |
+
2025-03-02 14:51:31,156 [INFO] __main__ - Processing table image: topic_extraction/img_7.jpg, columns=three
|
| 157 |
+
2025-03-02 14:51:34,974 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_7_row0_col0.jpg
|
| 158 |
+
2025-03-02 14:51:35,285 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_7_row0_col1.jpg
|
| 159 |
+
2025-03-02 14:51:35,286 [INFO] __main__ - Processing table image: topic_extraction/img_8.jpg, columns=three
|
| 160 |
+
2025-03-02 14:51:39,778 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_8_row0_col0.jpg
|
| 161 |
+
2025-03-02 14:51:40,072 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_8_row0_col1.jpg
|
| 162 |
+
2025-03-02 14:51:40,072 [INFO] __main__ - Processing table image: topic_extraction/img_9.jpg, columns=three
|
| 163 |
+
2025-03-02 14:51:44,011 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_9_row0_col0.jpg
|
| 164 |
+
2025-03-02 14:51:44,302 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_9_row0_col1.jpg
|
| 165 |
+
2025-03-02 14:51:44,531 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_9_row1_col0.jpg
|
| 166 |
+
2025-03-02 14:51:44,775 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_9_row2_col0.jpg
|
| 167 |
+
2025-03-02 14:51:44,961 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_9_row2_col1.jpg
|
| 168 |
+
2025-03-02 14:51:45,180 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_9_row3_col0.jpg
|
| 169 |
+
2025-03-02 14:51:45,181 [INFO] __main__ - Processing table image: topic_extraction/img_10.jpg, columns=three
|
| 170 |
+
2025-03-02 14:51:49,515 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row0_col0.jpg
|
| 171 |
+
2025-03-02 14:51:49,817 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row0_col1.jpg
|
| 172 |
+
2025-03-02 14:51:50,041 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row0_col2.jpg
|
| 173 |
+
2025-03-02 14:51:50,258 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row1_col0.jpg
|
| 174 |
+
2025-03-02 14:51:50,504 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row1_col1.jpg
|
| 175 |
+
2025-03-02 14:51:50,693 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row2_col0.jpg
|
| 176 |
+
2025-03-02 14:51:50,910 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row2_col1.jpg
|
| 177 |
+
2025-03-02 14:51:51,124 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row3_col0.jpg
|
| 178 |
+
2025-03-02 14:51:51,347 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row3_col1.jpg
|
| 179 |
+
2025-03-02 14:51:51,347 [INFO] __main__ - Processing table image: topic_extraction/img_11.jpg, columns=three
|
| 180 |
+
2025-03-02 14:51:53,781 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_11_row0_col0.jpg
|
| 181 |
+
2025-03-02 14:51:54,064 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_11_row0_col1.jpg
|
| 182 |
+
2025-03-02 14:51:54,272 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_11_row1_col0.jpg
|
| 183 |
+
2025-03-02 14:51:54,272 [INFO] __main__ - Processing table image: topic_extraction/img_12.jpg, columns=three
|
| 184 |
+
2025-03-02 14:51:57,509 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_12_row0_col0.jpg
|
| 185 |
+
2025-03-02 14:51:57,808 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_12_row0_col1.jpg
|
| 186 |
+
2025-03-02 14:51:57,808 [INFO] __main__ - Processing table image: topic_extraction/img_13.jpg, columns=three
|
| 187 |
+
2025-03-02 14:52:02,191 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_13_row0_col0.jpg
|
| 188 |
+
2025-03-02 14:52:02,483 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_13_row0_col1.jpg
|
| 189 |
+
2025-03-02 14:52:02,688 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_13_row1_col0.jpg
|
| 190 |
+
2025-03-02 14:52:02,689 [INFO] __main__ - Processing table image: topic_extraction/img_14.jpg, columns=three
|
| 191 |
+
2025-03-02 14:52:06,735 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_14_row0_col0.jpg
|
| 192 |
+
2025-03-02 14:52:07,026 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_14_row0_col1.jpg
|
| 193 |
+
2025-03-02 14:52:07,272 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_14_row1_col0.jpg
|
| 194 |
+
2025-03-02 14:52:07,462 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_14_row2_col0.jpg
|
| 195 |
+
2025-03-02 14:52:07,680 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_14_row2_col1.jpg
|
| 196 |
+
2025-03-02 14:52:07,680 [INFO] __main__ - Processing table image: topic_extraction/img_15.jpg, columns=three
|
| 197 |
+
2025-03-02 14:52:11,136 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_15_row0_col0.jpg
|
| 198 |
+
2025-03-02 14:52:11,398 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_15_row0_col1.jpg
|
| 199 |
+
2025-03-02 14:52:11,588 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_15_row1_col0.jpg
|
| 200 |
+
2025-03-02 14:52:11,803 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_15_row2_col0.jpg
|
| 201 |
+
2025-03-02 14:52:12,023 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_15_row3_col0.jpg
|
| 202 |
+
2025-03-02 14:52:12,023 [INFO] __main__ - Processing table image: topic_extraction/img_16.jpg, columns=three
|
| 203 |
+
2025-03-02 14:52:16,429 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_16_row0_col0.jpg
|
| 204 |
+
2025-03-02 14:52:16,715 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_16_row0_col1.jpg
|
| 205 |
+
2025-03-02 14:52:16,715 [INFO] __main__ - Processing table image: topic_extraction/img_17.jpg, columns=three
|
| 206 |
+
2025-03-02 14:52:19,989 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_17_row0_col0.jpg
|
| 207 |
+
2025-03-02 14:52:20,300 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_17_row0_col1.jpg
|
| 208 |
+
2025-03-02 14:52:20,512 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_17_row1_col0.jpg
|
| 209 |
+
2025-03-02 14:52:20,513 [INFO] __main__ - Processing table image: topic_extraction/img_18.jpg, columns=three
|
| 210 |
+
2025-03-02 14:52:24,235 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_18_row0_col0.jpg
|
| 211 |
+
2025-03-02 14:52:24,536 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_18_row0_col1.jpg
|
| 212 |
+
2025-03-02 14:52:24,744 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_18_row1_col0.jpg
|
| 213 |
+
2025-03-02 14:52:24,744 [INFO] __main__ - Processing table image: topic_extraction/img_19.jpg, columns=three
|
| 214 |
+
2025-03-02 14:52:28,508 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_19_row0_col0.jpg
|
| 215 |
+
2025-03-02 14:52:28,795 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_19_row0_col1.jpg
|
| 216 |
+
2025-03-02 14:52:29,017 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_19_row1_col0.jpg
|
| 217 |
+
2025-03-02 14:52:29,018 [INFO] __main__ - Processing table image: topic_extraction/img_20.jpg, columns=three
|
| 218 |
+
2025-03-02 14:52:33,326 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_20_row0_col0.jpg
|
| 219 |
+
2025-03-02 14:52:33,636 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_20_row0_col1.jpg
|
| 220 |
+
2025-03-02 14:52:33,636 [INFO] __main__ - Processing table image: topic_extraction/img_21.jpg, columns=three
|
| 221 |
+
2025-03-02 14:52:37,439 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_21_row0_col0.jpg
|
| 222 |
+
2025-03-02 14:52:37,439 [INFO] __main__ - Processing table image: topic_extraction/img_22.jpg, columns=three
|
| 223 |
+
2025-03-02 14:52:40,149 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_22_row0_col0.jpg
|
| 224 |
+
2025-03-02 14:52:40,422 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_22_row0_col1.jpg
|
| 225 |
+
2025-03-02 14:52:40,633 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_22_row1_col0.jpg
|
| 226 |
+
2025-03-02 14:52:40,815 [INFO] __main__ - Uploaded to S3: topic_extraction/final_output.md
|
| 227 |
+
2025-03-02 14:52:40,815 [INFO] __main__ - Final markdown uploaded to S3 at topic_extraction/final_output.md
|
| 228 |
+
2025-03-02 14:52:41,142 [INFO] __main__ - GPU memory cleaned up.
|
| 229 |
+
2025-03-02 14:52:41,149 [INFO] __main__ - Processing completed successfully.
|
| 230 |
+
2025-03-02 14:58:58,095 [INFO] __main__ - Processing PDF: /home/user/app/input_output/ocr-specification-economics.pdf
|
| 231 |
+
2025-03-02 14:58:59,186 [INFO] __main__ - Gemini returned subtopics: {'Content of A Level in Economics (H460)': [5, 5], 'Content of Component 1: Microeconomics (H460/01)': [6, 16], 'Content of Component 2: Macroeconomics (H460/02)': [17, 27], 'Content of Component 3: Themes in economics (H460/03)': [28, 28]}
|
| 232 |
+
2025-03-02 14:58:59,197 [INFO] __main__ - Loaded 9752567 bytes from local file '/home/user/app/input_output/ocr-specification-economics.pdf'
|
| 233 |
+
2025-03-02 14:58:59,457 [INFO] __main__ - Occurrences of subtopic 'Content of A Level in Economics (H460)': [2, 10]
|
| 234 |
+
2025-03-02 14:58:59,709 [INFO] __main__ - Occurrences of subtopic 'Content of Component 1: Microeconomics (H460/01)': [2, 11]
|
| 235 |
+
2025-03-02 14:58:59,957 [INFO] __main__ - Occurrences of subtopic 'Content of Component 2: Macroeconomics (H460/02)': [2, 22]
|
| 236 |
+
2025-03-02 14:59:00,181 [INFO] __main__ - Occurrences of subtopic 'Content of Component 3: Themes in economics (H460/03)': [2, 33]
|
| 237 |
+
2025-03-02 14:59:00,181 [INFO] __main__ - Processing pages (0-based): [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
|
| 238 |
+
2025-03-02 14:59:33,938 [INFO] __main__ - doc_analyze complete. Extracting images.
|
| 239 |
+
2025-03-02 14:59:36,525 [INFO] __main__ - Uploaded to S3: topic_extraction/img_1.jpg
|
| 240 |
+
2025-03-02 14:59:37,279 [INFO] __main__ - Uploaded to S3: topic_extraction/img_2.jpg
|
| 241 |
+
2025-03-02 14:59:37,821 [INFO] __main__ - Uploaded to S3: topic_extraction/img_3.jpg
|
| 242 |
+
2025-03-02 14:59:38,429 [INFO] __main__ - Uploaded to S3: topic_extraction/img_4.jpg
|
| 243 |
+
2025-03-02 14:59:38,965 [INFO] __main__ - Uploaded to S3: topic_extraction/img_5.jpg
|
| 244 |
+
2025-03-02 14:59:39,595 [INFO] __main__ - Uploaded to S3: topic_extraction/img_6.jpg
|
| 245 |
+
2025-03-02 14:59:40,173 [INFO] __main__ - Uploaded to S3: topic_extraction/img_7.jpg
|
| 246 |
+
2025-03-02 14:59:40,919 [INFO] __main__ - Uploaded to S3: topic_extraction/img_8.jpg
|
| 247 |
+
2025-03-02 14:59:41,487 [INFO] __main__ - Uploaded to S3: topic_extraction/img_9.jpg
|
| 248 |
+
2025-03-02 14:59:42,093 [INFO] __main__ - Uploaded to S3: topic_extraction/img_10.jpg
|
| 249 |
+
2025-03-02 14:59:42,494 [INFO] __main__ - Uploaded to S3: topic_extraction/img_11.jpg
|
| 250 |
+
2025-03-02 14:59:43,010 [INFO] __main__ - Uploaded to S3: topic_extraction/img_12.jpg
|
| 251 |
+
2025-03-02 14:59:43,608 [INFO] __main__ - Uploaded to S3: topic_extraction/img_13.jpg
|
| 252 |
+
2025-03-02 14:59:44,159 [INFO] __main__ - Uploaded to S3: topic_extraction/img_14.jpg
|
| 253 |
+
2025-03-02 14:59:44,653 [INFO] __main__ - Uploaded to S3: topic_extraction/img_15.jpg
|
| 254 |
+
2025-03-02 14:59:45,249 [INFO] __main__ - Uploaded to S3: topic_extraction/img_16.jpg
|
| 255 |
+
2025-03-02 14:59:45,706 [INFO] __main__ - Uploaded to S3: topic_extraction/img_17.jpg
|
| 256 |
+
2025-03-02 14:59:46,272 [INFO] __main__ - Uploaded to S3: topic_extraction/img_18.jpg
|
| 257 |
+
2025-03-02 14:59:46,931 [INFO] __main__ - Uploaded to S3: topic_extraction/img_19.jpg
|
| 258 |
+
2025-03-02 14:59:47,541 [INFO] __main__ - Uploaded to S3: topic_extraction/img_20.jpg
|
| 259 |
+
2025-03-02 14:59:48,107 [INFO] __main__ - Uploaded to S3: topic_extraction/img_21.jpg
|
| 260 |
+
2025-03-02 14:59:48,523 [INFO] __main__ - Uploaded to S3: topic_extraction/img_22.jpg
|
| 261 |
+
2025-03-02 14:59:49,068 [INFO] __main__ - Classifying images to detect tables.
|
| 262 |
+
2025-03-02 15:00:11,610 [INFO] __main__ - Processing table image: topic_extraction/img_1.jpg, columns=three
|
| 263 |
+
2025-03-02 15:00:15,216 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_1_row0_col0.jpg
|
| 264 |
+
2025-03-02 15:00:15,512 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_1_row0_col1.jpg
|
| 265 |
+
2025-03-02 15:00:15,709 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_1_row1_col0.jpg
|
| 266 |
+
2025-03-02 15:00:15,946 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_1_row2_col0.jpg
|
| 267 |
+
2025-03-02 15:00:15,946 [INFO] __main__ - Processing table image: topic_extraction/img_2.jpg, columns=three
|
| 268 |
+
2025-03-02 15:00:20,225 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_2_row0_col0.jpg
|
| 269 |
+
2025-03-02 15:00:20,509 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_2_row0_col1.jpg
|
| 270 |
+
2025-03-02 15:00:20,720 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_2_row1_col0.jpg
|
| 271 |
+
2025-03-02 15:00:20,948 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_2_row2_col0.jpg
|
| 272 |
+
2025-03-02 15:00:21,181 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_2_row2_col1.jpg
|
| 273 |
+
2025-03-02 15:00:21,429 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_2_row3_col0.jpg
|
| 274 |
+
2025-03-02 15:00:21,429 [INFO] __main__ - Processing table image: topic_extraction/img_3.jpg, columns=three
|
| 275 |
+
2025-03-02 15:00:24,720 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_3_row0_col0.jpg
|
| 276 |
+
2025-03-02 15:00:25,026 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_3_row0_col1.jpg
|
| 277 |
+
2025-03-02 15:00:25,026 [INFO] __main__ - Processing table image: topic_extraction/img_4.jpg, columns=three
|
| 278 |
+
2025-03-02 15:00:28,916 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_4_row0_col0.jpg
|
| 279 |
+
2025-03-02 15:00:29,206 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_4_row0_col1.jpg
|
| 280 |
+
2025-03-02 15:00:29,206 [INFO] __main__ - Processing table image: topic_extraction/img_5.jpg, columns=three
|
| 281 |
+
2025-03-02 15:00:32,671 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_5_row0_col0.jpg
|
| 282 |
+
2025-03-02 15:00:32,950 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_5_row0_col1.jpg
|
| 283 |
+
2025-03-02 15:00:33,187 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_5_row1_col0.jpg
|
| 284 |
+
2025-03-02 15:00:33,188 [INFO] __main__ - Processing table image: topic_extraction/img_6.jpg, columns=three
|
| 285 |
+
2025-03-02 15:00:37,317 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_6_row0_col0.jpg
|
| 286 |
+
2025-03-02 15:00:37,615 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_6_row0_col1.jpg
|
| 287 |
+
2025-03-02 15:00:37,616 [INFO] __main__ - Processing table image: topic_extraction/img_7.jpg, columns=three
|
| 288 |
+
2025-03-02 15:00:41,547 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_7_row0_col0.jpg
|
| 289 |
+
2025-03-02 15:00:41,827 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_7_row0_col1.jpg
|
| 290 |
+
2025-03-02 15:00:41,827 [INFO] __main__ - Processing table image: topic_extraction/img_8.jpg, columns=three
|
| 291 |
+
2025-03-02 15:00:45,889 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_8_row0_col0.jpg
|
| 292 |
+
2025-03-02 15:00:46,174 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_8_row0_col1.jpg
|
| 293 |
+
2025-03-02 15:00:46,175 [INFO] __main__ - Processing table image: topic_extraction/img_9.jpg, columns=three
|
| 294 |
+
2025-03-02 15:00:50,348 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_9_row0_col0.jpg
|
| 295 |
+
2025-03-02 15:00:50,645 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_9_row0_col1.jpg
|
| 296 |
+
2025-03-02 15:00:50,867 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_9_row1_col0.jpg
|
| 297 |
+
2025-03-02 15:00:51,082 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_9_row2_col0.jpg
|
| 298 |
+
2025-03-02 15:00:51,323 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_9_row2_col1.jpg
|
| 299 |
+
2025-03-02 15:00:51,538 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_9_row3_col0.jpg
|
| 300 |
+
2025-03-02 15:00:51,538 [INFO] __main__ - Processing table image: topic_extraction/img_10.jpg, columns=three
|
| 301 |
+
2025-03-02 15:00:55,761 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row0_col0.jpg
|
| 302 |
+
2025-03-02 15:00:56,051 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row0_col1.jpg
|
| 303 |
+
2025-03-02 15:00:56,295 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row0_col2.jpg
|
| 304 |
+
2025-03-02 15:00:56,505 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row1_col0.jpg
|
| 305 |
+
2025-03-02 15:00:56,718 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row1_col1.jpg
|
| 306 |
+
2025-03-02 15:00:56,925 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row2_col0.jpg
|
| 307 |
+
2025-03-02 15:00:57,127 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row2_col1.jpg
|
| 308 |
+
2025-03-02 15:00:57,358 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row3_col0.jpg
|
| 309 |
+
2025-03-02 15:00:57,583 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row3_col1.jpg
|
| 310 |
+
2025-03-02 15:00:57,583 [INFO] __main__ - Processing table image: topic_extraction/img_11.jpg, columns=three
|
| 311 |
+
2025-03-02 15:01:00,162 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_11_row0_col0.jpg
|
| 312 |
+
2025-03-02 15:01:00,435 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_11_row0_col1.jpg
|
| 313 |
+
2025-03-02 15:01:00,648 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_11_row1_col0.jpg
|
| 314 |
+
2025-03-02 15:01:00,648 [INFO] __main__ - Processing table image: topic_extraction/img_12.jpg, columns=three
|
| 315 |
+
2025-03-02 15:01:03,879 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_12_row0_col0.jpg
|
| 316 |
+
2025-03-02 15:01:04,133 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_12_row0_col1.jpg
|
| 317 |
+
2025-03-02 15:01:04,133 [INFO] __main__ - Processing table image: topic_extraction/img_13.jpg, columns=three
|
| 318 |
+
2025-03-02 15:01:08,128 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_13_row0_col0.jpg
|
| 319 |
+
2025-03-02 15:01:08,421 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_13_row0_col1.jpg
|
| 320 |
+
2025-03-02 15:01:08,637 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_13_row1_col0.jpg
|
| 321 |
+
2025-03-02 15:01:08,637 [INFO] __main__ - Processing table image: topic_extraction/img_14.jpg, columns=three
|
| 322 |
+
2025-03-02 15:01:12,721 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_14_row0_col0.jpg
|
| 323 |
+
2025-03-02 15:01:13,014 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_14_row0_col1.jpg
|
| 324 |
+
2025-03-02 15:01:13,229 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_14_row1_col0.jpg
|
| 325 |
+
2025-03-02 15:01:13,484 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_14_row2_col0.jpg
|
| 326 |
+
2025-03-02 15:01:13,725 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_14_row2_col1.jpg
|
| 327 |
+
2025-03-02 15:01:13,726 [INFO] __main__ - Processing table image: topic_extraction/img_15.jpg, columns=three
|
| 328 |
+
2025-03-02 15:01:16,869 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_15_row0_col0.jpg
|
| 329 |
+
2025-03-02 15:01:17,179 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_15_row0_col1.jpg
|
| 330 |
+
2025-03-02 15:01:17,395 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_15_row1_col0.jpg
|
| 331 |
+
2025-03-02 15:01:17,610 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_15_row2_col0.jpg
|
| 332 |
+
2025-03-02 15:01:17,827 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_15_row3_col0.jpg
|
| 333 |
+
2025-03-02 15:01:17,827 [INFO] __main__ - Processing table image: topic_extraction/img_16.jpg, columns=three
|
| 334 |
+
2025-03-02 15:01:22,073 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_16_row0_col0.jpg
|
| 335 |
+
2025-03-02 15:01:22,375 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_16_row0_col1.jpg
|
| 336 |
+
2025-03-02 15:01:22,376 [INFO] __main__ - Processing table image: topic_extraction/img_17.jpg, columns=three
|
| 337 |
+
2025-03-02 15:01:25,546 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_17_row0_col0.jpg
|
| 338 |
+
2025-03-02 15:01:25,841 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_17_row0_col1.jpg
|
| 339 |
+
2025-03-02 15:01:26,080 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_17_row1_col0.jpg
|
| 340 |
+
2025-03-02 15:01:26,081 [INFO] __main__ - Processing table image: topic_extraction/img_18.jpg, columns=three
|
| 341 |
+
2025-03-02 15:01:30,055 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_18_row0_col0.jpg
|
| 342 |
+
2025-03-02 15:01:30,330 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_18_row0_col1.jpg
|
| 343 |
+
2025-03-02 15:01:30,557 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_18_row1_col0.jpg
|
| 344 |
+
2025-03-02 15:01:30,557 [INFO] __main__ - Processing table image: topic_extraction/img_19.jpg, columns=three
|
| 345 |
+
2025-03-02 15:01:34,337 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_19_row0_col0.jpg
|
| 346 |
+
2025-03-02 15:01:34,610 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_19_row0_col1.jpg
|
| 347 |
+
2025-03-02 15:01:34,814 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_19_row1_col0.jpg
|
| 348 |
+
2025-03-02 15:01:34,814 [INFO] __main__ - Processing table image: topic_extraction/img_20.jpg, columns=three
|
| 349 |
+
2025-03-02 15:01:39,093 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_20_row0_col0.jpg
|
| 350 |
+
2025-03-02 15:01:39,389 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_20_row0_col1.jpg
|
| 351 |
+
2025-03-02 15:01:39,389 [INFO] __main__ - Processing table image: topic_extraction/img_21.jpg, columns=three
|
| 352 |
+
2025-03-02 15:01:43,178 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_21_row0_col0.jpg
|
| 353 |
+
2025-03-02 15:01:43,179 [INFO] __main__ - Processing table image: topic_extraction/img_22.jpg, columns=three
|
| 354 |
+
2025-03-02 15:01:45,849 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_22_row0_col0.jpg
|
| 355 |
+
2025-03-02 15:01:46,146 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_22_row0_col1.jpg
|
| 356 |
+
2025-03-02 15:01:46,351 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_22_row1_col0.jpg
|
| 357 |
+
2025-03-02 15:01:46,529 [INFO] __main__ - Uploaded to S3: topic_extraction/final_output.md
|
| 358 |
+
2025-03-02 15:01:46,529 [INFO] __main__ - Final markdown uploaded to S3 at topic_extraction/final_output.md
|
| 359 |
+
2025-03-02 15:01:46,530 [INFO] __main__ - Final markdown saved locally at /home/user/app/outputs/final_output_local.md
|
| 360 |
+
2025-03-02 15:01:46,842 [INFO] __main__ - GPU memory cleaned up.
|
| 361 |
+
2025-03-02 15:01:46,849 [INFO] __main__ - Processing completed successfully.
|
| 362 |
+
2025-03-02 15:06:41,893 [INFO] __main__ - Processing PDF: /home/user/app/input_output/ocr-specification-economics.pdf
|
| 363 |
+
2025-03-02 15:06:42,974 [INFO] __main__ - Gemini returned subtopics: {'Content of A Level in Economics (H460)': [5, 5], 'Content of Component 1: Microeconomics (H460/01)': [6, 16], 'Content of Component 2: Macroeconomics (H460/02)': [17, 27], 'Content of Component 3: Themes in economics (H460/03)': [28, 28]}
|
| 364 |
+
2025-03-02 15:06:42,980 [INFO] __main__ - Loaded 9752567 bytes from local file '/home/user/app/input_output/ocr-specification-economics.pdf'
|
| 365 |
+
2025-03-02 15:06:43,152 [INFO] __main__ - Occurrences of subtopic 'Content of A Level in Economics (H460)': [2, 10]
|
| 366 |
+
2025-03-02 15:06:43,298 [INFO] __main__ - Occurrences of subtopic 'Content of Component 1: Microeconomics (H460/01)': [2, 11]
|
| 367 |
+
2025-03-02 15:06:43,444 [INFO] __main__ - Occurrences of subtopic 'Content of Component 2: Macroeconomics (H460/02)': [2, 22]
|
| 368 |
+
2025-03-02 15:06:43,590 [INFO] __main__ - Occurrences of subtopic 'Content of Component 3: Themes in economics (H460/03)': [2, 33]
|
| 369 |
+
2025-03-02 15:06:43,591 [INFO] __main__ - Processing pages (0-based): [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
|
| 370 |
+
2025-03-02 15:07:14,912 [INFO] __main__ - doc_analyze complete. Extracting images.
|
| 371 |
+
2025-03-02 15:07:17,601 [INFO] __main__ - Uploaded to S3: topic_extraction/img_1.jpg
|
| 372 |
+
2025-03-02 15:07:18,324 [INFO] __main__ - Uploaded to S3: topic_extraction/img_2.jpg
|
| 373 |
+
2025-03-02 15:07:18,895 [INFO] __main__ - Uploaded to S3: topic_extraction/img_3.jpg
|
| 374 |
+
2025-03-02 15:07:19,484 [INFO] __main__ - Uploaded to S3: topic_extraction/img_4.jpg
|
| 375 |
+
2025-03-02 15:07:20,030 [INFO] __main__ - Uploaded to S3: topic_extraction/img_5.jpg
|
| 376 |
+
2025-03-02 15:07:20,631 [INFO] __main__ - Uploaded to S3: topic_extraction/img_6.jpg
|
| 377 |
+
2025-03-02 15:07:21,213 [INFO] __main__ - Uploaded to S3: topic_extraction/img_7.jpg
|
| 378 |
+
2025-03-02 15:07:21,821 [INFO] __main__ - Uploaded to S3: topic_extraction/img_8.jpg
|
| 379 |
+
2025-03-02 15:07:22,408 [INFO] __main__ - Uploaded to S3: topic_extraction/img_9.jpg
|
| 380 |
+
2025-03-02 15:07:23,030 [INFO] __main__ - Uploaded to S3: topic_extraction/img_10.jpg
|
| 381 |
+
2025-03-02 15:07:23,487 [INFO] __main__ - Uploaded to S3: topic_extraction/img_11.jpg
|
| 382 |
+
2025-03-02 15:07:24,101 [INFO] __main__ - Uploaded to S3: topic_extraction/img_12.jpg
|
| 383 |
+
2025-03-02 15:07:24,728 [INFO] __main__ - Uploaded to S3: topic_extraction/img_13.jpg
|
| 384 |
+
2025-03-02 15:07:25,314 [INFO] __main__ - Uploaded to S3: topic_extraction/img_14.jpg
|
| 385 |
+
2025-03-02 15:07:25,789 [INFO] __main__ - Uploaded to S3: topic_extraction/img_15.jpg
|
| 386 |
+
2025-03-02 15:07:26,385 [INFO] __main__ - Uploaded to S3: topic_extraction/img_16.jpg
|
| 387 |
+
2025-03-02 15:07:26,857 [INFO] __main__ - Uploaded to S3: topic_extraction/img_17.jpg
|
| 388 |
+
2025-03-02 15:07:27,419 [INFO] __main__ - Uploaded to S3: topic_extraction/img_18.jpg
|
| 389 |
+
2025-03-02 15:07:28,000 [INFO] __main__ - Uploaded to S3: topic_extraction/img_19.jpg
|
| 390 |
+
2025-03-02 15:07:28,589 [INFO] __main__ - Uploaded to S3: topic_extraction/img_20.jpg
|
| 391 |
+
2025-03-02 15:07:29,168 [INFO] __main__ - Uploaded to S3: topic_extraction/img_21.jpg
|
| 392 |
+
2025-03-02 15:07:29,673 [INFO] __main__ - Uploaded to S3: topic_extraction/img_22.jpg
|
| 393 |
+
2025-03-02 15:07:30,148 [INFO] __main__ - Classifying images to detect tables.
|
| 394 |
+
2025-03-02 15:07:52,573 [INFO] __main__ - Processing table image: topic_extraction/img_1.jpg, columns=three
|
| 395 |
+
2025-03-02 15:07:55,799 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_1_row0_col0.jpg
|
| 396 |
+
2025-03-02 15:07:56,077 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_1_row0_col1.jpg
|
| 397 |
+
2025-03-02 15:07:56,284 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_1_row1_col0.jpg
|
| 398 |
+
2025-03-02 15:07:56,470 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_1_row2_col0.jpg
|
| 399 |
+
2025-03-02 15:07:56,470 [INFO] __main__ - Processing table image: topic_extraction/img_2.jpg, columns=three
|
| 400 |
+
2025-03-02 15:08:00,547 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_2_row0_col0.jpg
|
| 401 |
+
2025-03-02 15:08:00,838 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_2_row0_col1.jpg
|
| 402 |
+
2025-03-02 15:08:01,053 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_2_row1_col0.jpg
|
| 403 |
+
2025-03-02 15:08:01,255 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_2_row2_col0.jpg
|
| 404 |
+
2025-03-02 15:08:01,480 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_2_row2_col1.jpg
|
| 405 |
+
2025-03-02 15:08:01,704 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_2_row3_col0.jpg
|
| 406 |
+
2025-03-02 15:08:01,704 [INFO] __main__ - Processing table image: topic_extraction/img_3.jpg, columns=three
|
| 407 |
+
2025-03-02 15:08:04,907 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_3_row0_col0.jpg
|
| 408 |
+
2025-03-02 15:08:05,215 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_3_row0_col1.jpg
|
| 409 |
+
2025-03-02 15:08:05,216 [INFO] __main__ - Processing table image: topic_extraction/img_4.jpg, columns=three
|
| 410 |
+
2025-03-02 15:08:08,972 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_4_row0_col0.jpg
|
| 411 |
+
2025-03-02 15:08:09,297 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_4_row0_col1.jpg
|
| 412 |
+
2025-03-02 15:08:09,297 [INFO] __main__ - Processing table image: topic_extraction/img_5.jpg, columns=three
|
| 413 |
+
2025-03-02 15:08:12,704 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_5_row0_col0.jpg
|
| 414 |
+
2025-03-02 15:08:12,991 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_5_row0_col1.jpg
|
| 415 |
+
2025-03-02 15:08:13,203 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_5_row1_col0.jpg
|
| 416 |
+
2025-03-02 15:08:13,204 [INFO] __main__ - Processing table image: topic_extraction/img_6.jpg, columns=three
|
| 417 |
+
2025-03-02 15:08:17,226 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_6_row0_col0.jpg
|
| 418 |
+
2025-03-02 15:08:17,513 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_6_row0_col1.jpg
|
| 419 |
+
2025-03-02 15:08:17,514 [INFO] __main__ - Processing table image: topic_extraction/img_7.jpg, columns=three
|
| 420 |
+
2025-03-02 15:08:21,513 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_7_row0_col0.jpg
|
| 421 |
+
2025-03-02 15:08:21,822 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_7_row0_col1.jpg
|
| 422 |
+
2025-03-02 15:08:21,822 [INFO] __main__ - Processing table image: topic_extraction/img_8.jpg, columns=three
|
| 423 |
+
2025-03-02 15:08:26,051 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_8_row0_col0.jpg
|
| 424 |
+
2025-03-02 15:08:26,349 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_8_row0_col1.jpg
|
| 425 |
+
2025-03-02 15:08:26,350 [INFO] __main__ - Processing table image: topic_extraction/img_9.jpg, columns=three
|
| 426 |
+
2025-03-02 15:08:30,459 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_9_row0_col0.jpg
|
| 427 |
+
2025-03-02 15:08:30,758 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_9_row0_col1.jpg
|
| 428 |
+
2025-03-02 15:08:30,974 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_9_row1_col0.jpg
|
| 429 |
+
2025-03-02 15:08:31,214 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_9_row2_col0.jpg
|
| 430 |
+
2025-03-02 15:08:31,436 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_9_row2_col1.jpg
|
| 431 |
+
2025-03-02 15:08:31,657 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_9_row3_col0.jpg
|
| 432 |
+
2025-03-02 15:08:31,658 [INFO] __main__ - Processing table image: topic_extraction/img_10.jpg, columns=three
|
| 433 |
+
2025-03-02 15:08:35,933 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row0_col0.jpg
|
| 434 |
+
2025-03-02 15:08:36,209 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row0_col1.jpg
|
| 435 |
+
2025-03-02 15:08:36,396 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row0_col2.jpg
|
| 436 |
+
2025-03-02 15:08:36,585 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row1_col0.jpg
|
| 437 |
+
2025-03-02 15:08:36,801 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row1_col1.jpg
|
| 438 |
+
2025-03-02 15:08:37,025 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row2_col0.jpg
|
| 439 |
+
2025-03-02 15:08:37,237 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row2_col1.jpg
|
| 440 |
+
2025-03-02 15:08:37,454 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row3_col0.jpg
|
| 441 |
+
2025-03-02 15:08:37,673 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_10_row3_col1.jpg
|
| 442 |
+
2025-03-02 15:08:37,673 [INFO] __main__ - Processing table image: topic_extraction/img_11.jpg, columns=three
|
| 443 |
+
2025-03-02 15:08:40,182 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_11_row0_col0.jpg
|
| 444 |
+
2025-03-02 15:08:40,477 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_11_row0_col1.jpg
|
| 445 |
+
2025-03-02 15:08:40,700 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_11_row1_col0.jpg
|
| 446 |
+
2025-03-02 15:08:40,701 [INFO] __main__ - Processing table image: topic_extraction/img_12.jpg, columns=three
|
| 447 |
+
2025-03-02 15:08:43,907 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_12_row0_col0.jpg
|
| 448 |
+
2025-03-02 15:08:44,178 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_12_row0_col1.jpg
|
| 449 |
+
2025-03-02 15:08:44,178 [INFO] __main__ - Processing table image: topic_extraction/img_13.jpg, columns=three
|
| 450 |
+
2025-03-02 15:08:48,524 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_13_row0_col0.jpg
|
| 451 |
+
2025-03-02 15:08:48,870 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_13_row0_col1.jpg
|
| 452 |
+
2025-03-02 15:08:49,079 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_13_row1_col0.jpg
|
| 453 |
+
2025-03-02 15:08:49,080 [INFO] __main__ - Processing table image: topic_extraction/img_14.jpg, columns=three
|
| 454 |
+
2025-03-02 15:08:53,612 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_14_row0_col0.jpg
|
| 455 |
+
2025-03-02 15:08:53,897 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_14_row0_col1.jpg
|
| 456 |
+
2025-03-02 15:08:54,116 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_14_row1_col0.jpg
|
| 457 |
+
2025-03-02 15:08:54,352 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_14_row2_col0.jpg
|
| 458 |
+
2025-03-02 15:08:54,578 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_14_row2_col1.jpg
|
| 459 |
+
2025-03-02 15:08:54,579 [INFO] __main__ - Processing table image: topic_extraction/img_15.jpg, columns=three
|
| 460 |
+
2025-03-02 15:08:58,339 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_15_row0_col0.jpg
|
| 461 |
+
2025-03-02 15:08:58,628 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_15_row0_col1.jpg
|
| 462 |
+
2025-03-02 15:08:58,843 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_15_row1_col0.jpg
|
| 463 |
+
2025-03-02 15:08:59,052 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_15_row2_col0.jpg
|
| 464 |
+
2025-03-02 15:08:59,262 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_15_row3_col0.jpg
|
| 465 |
+
2025-03-02 15:08:59,262 [INFO] __main__ - Processing table image: topic_extraction/img_16.jpg, columns=three
|
| 466 |
+
2025-03-02 15:09:04,508 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_16_row0_col0.jpg
|
| 467 |
+
2025-03-02 15:09:04,817 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_16_row0_col1.jpg
|
| 468 |
+
2025-03-02 15:09:04,818 [INFO] __main__ - Processing table image: topic_extraction/img_17.jpg, columns=three
|
| 469 |
+
2025-03-02 15:09:08,615 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_17_row0_col0.jpg
|
| 470 |
+
2025-03-02 15:09:08,905 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_17_row0_col1.jpg
|
| 471 |
+
2025-03-02 15:09:09,111 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_17_row1_col0.jpg
|
| 472 |
+
2025-03-02 15:09:09,111 [INFO] __main__ - Processing table image: topic_extraction/img_18.jpg, columns=three
|
| 473 |
+
2025-03-02 15:09:12,917 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_18_row0_col0.jpg
|
| 474 |
+
2025-03-02 15:09:13,215 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_18_row0_col1.jpg
|
| 475 |
+
2025-03-02 15:09:13,425 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_18_row1_col0.jpg
|
| 476 |
+
2025-03-02 15:09:13,426 [INFO] __main__ - Processing table image: topic_extraction/img_19.jpg, columns=three
|
| 477 |
+
2025-03-02 15:09:17,250 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_19_row0_col0.jpg
|
| 478 |
+
2025-03-02 15:09:17,561 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_19_row0_col1.jpg
|
| 479 |
+
2025-03-02 15:09:17,745 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_19_row1_col0.jpg
|
| 480 |
+
2025-03-02 15:09:17,745 [INFO] __main__ - Processing table image: topic_extraction/img_20.jpg, columns=three
|
| 481 |
+
2025-03-02 15:09:22,139 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_20_row0_col0.jpg
|
| 482 |
+
2025-03-02 15:09:22,430 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_20_row0_col1.jpg
|
| 483 |
+
2025-03-02 15:09:22,431 [INFO] __main__ - Processing table image: topic_extraction/img_21.jpg, columns=three
|
| 484 |
+
2025-03-02 15:09:26,166 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_21_row0_col0.jpg
|
| 485 |
+
2025-03-02 15:09:26,166 [INFO] __main__ - Processing table image: topic_extraction/img_22.jpg, columns=three
|
| 486 |
+
2025-03-02 15:09:29,061 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_22_row0_col0.jpg
|
| 487 |
+
2025-03-02 15:09:29,365 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_22_row0_col1.jpg
|
| 488 |
+
2025-03-02 15:09:29,588 [INFO] __main__ - Uploaded to S3: topic_extraction/cells/img_22_row1_col0.jpg
|
| 489 |
+
2025-03-02 15:09:29,768 [INFO] __main__ - Uploaded to S3: topic_extraction/final_output.md
|
| 490 |
+
2025-03-02 15:09:29,768 [INFO] __main__ - Final markdown uploaded to S3 at topic_extraction/final_output.md
|
| 491 |
+
2025-03-02 15:09:29,768 [INFO] __main__ - Final markdown saved locally at /home/user/app/outputs/final_output_local.md
|
| 492 |
+
2025-03-02 15:09:30,106 [INFO] __main__ - GPU memory cleaned up.
|
| 493 |
+
2025-03-02 15:09:30,113 [INFO] __main__ - Processing completed successfully.
|
| 494 |
+
2025-03-02 15:30:49,001 [INFO] __main__ - Running in test mode: using local image writer.
|
| 495 |
+
2025-03-02 15:30:49,001 [INFO] __main__ - Processing PDF: input_output/a-level-pearson-mathematics-specification.pdf
|
| 496 |
+
2025-03-02 15:30:49,764 [INFO] __main__ - Gemini returned subtopics: {'Paper 1: Pure Mathematics 1': [13, 20], 'Paper 2: Pure Mathematics 2': [21, 27], 'Paper 3: Statistics & Mechanics': [28, 34]}
|
| 497 |
+
2025-03-02 15:30:49,764 [INFO] __main__ - Loaded 1135473 bytes from local file 'input_output/a-level-pearson-mathematics-specification.pdf'
|
| 498 |
+
2025-03-02 15:30:50,022 [INFO] __main__ - Occurrences of subtopic 'Paper 1: Pure Mathematics 1': [8, 44]
|
| 499 |
+
2025-03-02 15:30:50,207 [INFO] __main__ - Occurrences of subtopic 'Paper 2: Pure Mathematics 2': [8, 44]
|
| 500 |
+
2025-03-02 15:30:50,354 [INFO] __main__ - Occurrences of subtopic 'Paper 3: Statistics & Mechanics': []
|
| 501 |
+
2025-03-02 15:30:50,354 [WARNING] __main__ - No suitable occurrence for 'Paper 3: Statistics & Mechanics'. Using page 0.
|
| 502 |
+
2025-03-02 15:30:50,354 [INFO] __main__ - Processing pages (0-based): [27, 28, 29, 30, 31, 32, 33, 44, 45, 46, 47, 48, 49, 50, 51]
|
| 503 |
+
2025-03-02 15:31:22,342 [INFO] __main__ - doc_analyze complete. Extracting images.
|
| 504 |
+
2025-03-02 15:31:25,822 [INFO] __main__ - Classifying images to detect tables.
|
| 505 |
+
2025-03-02 15:31:35,415 [INFO] __main__ - Processing table image => img_1.jpg, columns=three
|
| 506 |
+
2025-03-02 15:31:39,569 [INFO] __main__ - Processing table image => img_2.jpg, columns=three
|
| 507 |
+
2025-03-02 15:31:42,660 [INFO] __main__ - Processing table image => img_3.jpg, columns=three
|
| 508 |
+
2025-03-02 15:31:45,999 [INFO] __main__ - Processing table image => img_4.jpg, columns=three
|
| 509 |
+
2025-03-02 15:31:49,327 [INFO] __main__ - Processing table image => img_5.jpg, columns=three
|
| 510 |
+
2025-03-02 15:31:50,413 [INFO] __main__ - Processing table image => img_6.jpg, columns=three
|
| 511 |
+
2025-03-02 15:31:52,854 [INFO] __main__ - Final JSON saved locally at /home/user/app/outputs/final_output.json
|
| 512 |
+
2025-03-02 15:31:52,854 [INFO] __main__ - Final JSON saved locally at /home/user/app/outputs/final_output_local.json
|
| 513 |
+
2025-03-02 15:31:53,147 [INFO] __main__ - GPU memory cleaned up.
|
| 514 |
+
2025-03-02 15:31:53,150 [INFO] __main__ - Processing completed successfully.
|
| 515 |
+
2025-03-02 15:33:39,987 [INFO] __main__ - Running in test mode: using local image writer.
|
| 516 |
+
2025-03-02 15:33:39,987 [INFO] __main__ - Processing PDF: input_output/a-level-pearson-mathematics-specification.pdf
|
| 517 |
+
2025-03-02 15:33:40,750 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
|
| 518 |
+
2025-03-02 15:33:40,750 [INFO] __main__ - Loaded 1135473 bytes from local file 'input_output/a-level-pearson-mathematics-specification.pdf'
|
| 519 |
+
2025-03-02 15:33:40,900 [INFO] __main__ - Occurrences of subtopic 'Paper 1 and Paper 2: Pure Mathematics': [2, 3, 4, 14]
|
| 520 |
+
2025-03-02 15:33:41,049 [INFO] __main__ - Occurrences of subtopic 'Paper 3: Statistics and Mechanics': [3, 4, 9, 13, 33, 44]
|
| 521 |
+
2025-03-02 15:33:41,049 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
|
| 522 |
+
2025-03-02 15:33:45,815 [INFO] __main__ - GPU memory cleaned up.
|
| 523 |
+
2025-03-02 15:34:54,783 [INFO] __main__ - Running in test mode: using local image writer.
|
| 524 |
+
2025-03-02 15:34:54,783 [INFO] __main__ - Processing PDF: input_output/a-level-pearson-mathematics-specification.pdf
|
| 525 |
+
2025-03-02 15:34:55,467 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
|
| 526 |
+
2025-03-02 15:34:55,468 [INFO] __main__ - Loaded 1135473 bytes from local file 'input_output/a-level-pearson-mathematics-specification.pdf'
|
| 527 |
+
2025-03-02 15:34:55,725 [INFO] __main__ - Occurrences of subtopic 'Paper 1 and Paper 2: Pure Mathematics': [2, 3, 4, 14]
|
| 528 |
+
2025-03-02 15:34:55,880 [INFO] __main__ - Occurrences of subtopic 'Paper 3: Statistics and Mechanics': [3, 4, 9, 13, 33, 44]
|
| 529 |
+
2025-03-02 15:34:55,880 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
|
| 530 |
+
2025-03-02 15:35:50,447 [INFO] __main__ - doc_analyze complete. Extracting images.
|
| 531 |
+
2025-03-02 15:35:59,077 [INFO] __main__ - Classifying images to detect tables.
|
| 532 |
+
2025-03-02 15:36:05,947 [INFO] __main__ - GPU memory cleaned up.
|
| 533 |
+
2025-03-02 15:36:34,852 [INFO] __main__ - Running in test mode: using local image writer.
|
| 534 |
+
2025-03-02 15:36:34,852 [INFO] __main__ - Processing PDF: /home/user/app/input_output/ocr-specification-economics.pdf
|
| 535 |
+
2025-03-02 15:36:35,931 [INFO] __main__ - Gemini returned subtopics: {'Content of A Level in Economics (H460)': [5, 5], 'Content of Component 1: Microeconomics (H460/01)': [6, 16], 'Content of Component 2: Macroeconomics (H460/02)': [17, 27], 'Content of Component 3: Themes in economics (H460/03)': [28, 28]}
|
| 536 |
+
2025-03-02 15:36:35,936 [INFO] __main__ - Loaded 9752567 bytes from local file '/home/user/app/input_output/ocr-specification-economics.pdf'
|
| 537 |
+
2025-03-02 15:36:36,086 [INFO] __main__ - Occurrences of subtopic 'Content of A Level in Economics (H460)': [2, 10]
|
| 538 |
+
2025-03-02 15:36:36,245 [INFO] __main__ - Occurrences of subtopic 'Content of Component 1: Microeconomics (H460/01)': [2, 11]
|
| 539 |
+
2025-03-02 15:36:36,393 [INFO] __main__ - Occurrences of subtopic 'Content of Component 2: Macroeconomics (H460/02)': [2, 22]
|
| 540 |
+
2025-03-02 15:36:36,543 [INFO] __main__ - Occurrences of subtopic 'Content of Component 3: Themes in economics (H460/03)': [2, 33]
|
| 541 |
+
2025-03-02 15:36:36,543 [INFO] __main__ - Processing pages (0-based): [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
|
| 542 |
+
2025-03-02 15:36:49,670 [INFO] __main__ - GPU memory cleaned up.
|
| 543 |
+
2025-03-02 15:38:16,408 [INFO] __main__ - Running in test mode: using local image writer.
|
| 544 |
+
2025-03-02 15:38:16,409 [INFO] __main__ - Processing PDF: /home/user/app/input_output/a-level-pearson-mathematics-specification.pdf
|
| 545 |
+
2025-03-02 15:38:17,055 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
|
| 546 |
+
2025-03-02 15:38:17,056 [INFO] __main__ - Loaded 1135473 bytes from local file '/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf'
|
| 547 |
+
2025-03-02 15:38:17,212 [INFO] __main__ - Occurrences of subtopic 'Paper 1 and Paper 2: Pure Mathematics': [2, 3, 4, 14]
|
| 548 |
+
2025-03-02 15:38:17,360 [INFO] __main__ - Occurrences of subtopic 'Paper 3: Statistics and Mechanics': [3, 4, 9, 13, 33, 44]
|
| 549 |
+
2025-03-02 15:38:17,360 [INFO] __main__ - Processing pages (0-based): [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
|
| 550 |
+
2025-03-02 15:38:55,690 [INFO] __main__ - GPU memory cleaned up.
|
| 551 |
+
2025-03-02 15:39:13,884 [INFO] __main__ - Running in test mode: using local image writer.
|
| 552 |
+
2025-03-02 15:39:13,884 [INFO] __main__ - Processing PDF: input_output/a-level-pearson-mathematics-specification.pdf
|
| 553 |
+
2025-03-02 15:39:14,572 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
|
| 554 |
+
2025-03-02 15:39:14,572 [INFO] __main__ - Loaded 1135473 bytes from local file 'input_output/a-level-pearson-mathematics-specification.pdf'
|
| 555 |
+
2025-03-02 15:39:14,723 [INFO] __main__ - Occurrences of subtopic 'Paper 1 and Paper 2: Pure Mathematics': [2, 3, 4, 14]
|
| 556 |
+
2025-03-02 15:39:14,870 [INFO] __main__ - Occurrences of subtopic 'Paper 3: Statistics and Mechanics': [3, 4, 9, 13, 33, 44]
|
| 557 |
+
2025-03-02 15:39:14,870 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
|
| 558 |
+
2025-03-02 15:39:23,660 [INFO] __main__ - GPU memory cleaned up.
|
| 559 |
+
2025-03-02 15:40:12,360 [INFO] __main__ - Running in test mode: using local image writer.
|
| 560 |
+
2025-03-02 15:40:12,361 [INFO] __main__ - Processing PDF: input_output/a-level-pearson-mathematics-specification.pdf
|
| 561 |
+
2025-03-02 15:40:13,030 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
|
| 562 |
+
2025-03-02 15:40:13,030 [INFO] __main__ - Loaded 1135473 bytes from local file 'input_output/a-level-pearson-mathematics-specification.pdf'
|
| 563 |
+
2025-03-02 15:40:13,180 [INFO] __main__ - Occurrences of subtopic 'Paper 1 and Paper 2: Pure Mathematics': [2, 3, 4, 14]
|
| 564 |
+
2025-03-02 15:40:13,378 [INFO] __main__ - Occurrences of subtopic 'Paper 3: Statistics and Mechanics': [3, 4, 9, 13, 33, 44]
|
| 565 |
+
2025-03-02 15:40:13,379 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
|
| 566 |
+
2025-03-02 15:41:08,554 [INFO] __main__ - doc_analyze complete. Extracting images.
|
| 567 |
+
2025-03-02 15:41:17,501 [INFO] __main__ - Classifying images to detect tables.
|
| 568 |
+
2025-03-02 15:41:52,527 [INFO] __main__ - Processing table image => img_1.jpg, columns=three
|
| 569 |
+
2025-03-02 15:41:55,145 [INFO] __main__ - Processing table image => img_2.jpg, columns=three
|
| 570 |
+
2025-03-02 15:41:57,942 [INFO] __main__ - Processing table image => img_3.jpg, columns=three
|
| 571 |
+
2025-03-02 15:42:00,993 [INFO] __main__ - Processing table image => img_4.jpg, columns=three
|
| 572 |
+
2025-03-02 15:42:04,150 [INFO] __main__ - Processing table image => img_5.jpg, columns=three
|
| 573 |
+
2025-03-02 15:42:07,532 [INFO] __main__ - Processing table image => img_6.jpg, columns=three
|
| 574 |
+
2025-03-02 15:42:10,795 [INFO] __main__ - Processing table image => img_7.jpg, columns=three
|
| 575 |
+
2025-03-02 15:42:13,915 [INFO] __main__ - Processing table image => img_8.jpg, columns=three
|
| 576 |
+
2025-03-02 15:42:17,135 [INFO] __main__ - Processing table image => img_9.jpg, columns=three
|
| 577 |
+
2025-03-02 15:42:20,493 [INFO] __main__ - Processing table image => img_10.jpg, columns=three
|
| 578 |
+
2025-03-02 15:42:23,444 [INFO] __main__ - Processing table image => img_11.jpg, columns=two
|
| 579 |
+
2025-03-02 15:42:26,406 [INFO] __main__ - Processing table image => img_12.jpg, columns=three
|
| 580 |
+
2025-03-02 15:42:29,391 [INFO] __main__ - Processing table image => img_13.jpg, columns=three
|
| 581 |
+
2025-03-02 15:42:32,164 [INFO] __main__ - Processing table image => img_14.jpg, columns=three
|
| 582 |
+
2025-03-02 15:42:35,925 [INFO] __main__ - Processing table image => img_15.jpg, columns=three
|
| 583 |
+
2025-03-02 15:42:39,199 [INFO] __main__ - Processing table image => img_16.jpg, columns=three
|
| 584 |
+
2025-03-02 15:42:42,422 [INFO] __main__ - Processing table image => img_17.jpg, columns=three
|
| 585 |
+
2025-03-02 15:42:45,629 [INFO] __main__ - Processing table image => img_18.jpg, columns=three
|
| 586 |
+
2025-03-02 15:42:46,544 [INFO] __main__ - Processing table image => img_19.jpg, columns=three
|
| 587 |
+
2025-03-02 15:42:48,900 [INFO] __main__ - Processing table image => img_20.jpg, columns=three
|
| 588 |
+
2025-03-02 15:42:52,615 [INFO] __main__ - Processing table image => img_21.jpg, columns=three
|
| 589 |
+
2025-03-02 15:42:55,988 [INFO] __main__ - Processing table image => img_22.jpg, columns=three
|
| 590 |
+
2025-03-02 15:42:59,472 [INFO] __main__ - Processing table image => img_23.jpg, columns=three
|
| 591 |
+
2025-03-02 15:43:02,264 [INFO] __main__ - Processing table image => img_24.jpg, columns=three
|
| 592 |
+
2025-03-02 15:43:05,548 [INFO] __main__ - Processing table image => img_25.jpg, columns=two
|
| 593 |
+
2025-03-02 15:43:08,426 [INFO] __main__ - Processing table image => img_26.jpg, columns=three
|
| 594 |
+
2025-03-02 15:43:11,359 [INFO] __main__ - Processing table image => img_27.jpg, columns=three
|
| 595 |
+
2025-03-02 15:43:13,895 [INFO] __main__ - Processing table image => img_28.jpg, columns=two
|
| 596 |
+
2025-03-02 15:43:16,763 [INFO] __main__ - Final JSON saved locally at /home/user/app/outputs/final_output.json
|
| 597 |
+
2025-03-02 15:43:16,763 [INFO] __main__ - Final JSON saved locally at /home/user/app/outputs/final_output_local.json
|
| 598 |
+
2025-03-02 15:43:17,057 [INFO] __main__ - GPU memory cleaned up.
|
| 599 |
+
2025-03-02 15:43:17,064 [INFO] __main__ - Processing completed successfully.
|
| 600 |
+
2025-03-02 15:46:04,987 [INFO] __main__ - Running in test mode: using local image writer.
|
| 601 |
+
2025-03-02 15:46:04,988 [INFO] __main__ - Processing PDF: /home/user/app/input_output/ocr-specification-economics.pdf
|
| 602 |
+
2025-03-02 15:46:06,095 [INFO] __main__ - Gemini returned subtopics: {'Content of A Level in Economics (H460)': [5, 5], 'Content of Component 1: Microeconomics (H460/01)': [6, 16], 'Content of Component 2: Macroeconomics (H460/02)': [17, 27], 'Content of Component 3: Themes in economics (H460/03)': [28, 28]}
|
| 603 |
+
2025-03-02 15:46:06,100 [INFO] __main__ - Loaded 9752567 bytes from local file '/home/user/app/input_output/ocr-specification-economics.pdf'
|
| 604 |
+
2025-03-02 15:46:06,247 [INFO] __main__ - Occurrences of subtopic 'Content of A Level in Economics (H460)': [2, 10]
|
| 605 |
+
2025-03-02 15:46:06,393 [INFO] __main__ - Occurrences of subtopic 'Content of Component 1: Microeconomics (H460/01)': [2, 11]
|
| 606 |
+
2025-03-02 15:46:06,536 [INFO] __main__ - Occurrences of subtopic 'Content of Component 2: Macroeconomics (H460/02)': [2, 22]
|
| 607 |
+
2025-03-02 15:46:06,701 [INFO] __main__ - Occurrences of subtopic 'Content of Component 3: Themes in economics (H460/03)': [2, 33]
|
| 608 |
+
2025-03-02 15:46:06,701 [INFO] __main__ - Processing pages (0-based): [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
|
| 609 |
+
2025-03-02 15:46:38,108 [INFO] __main__ - doc_analyze complete. Extracting images.
|
| 610 |
+
2025-03-02 15:46:46,013 [INFO] __main__ - Classifying images to detect tables.
|
| 611 |
+
2025-03-02 15:47:09,314 [INFO] __main__ - Processing table image => img_1.jpg, columns=three
|
| 612 |
+
2025-03-02 15:47:11,825 [INFO] __main__ - Processing table image => img_2.jpg, columns=three
|
| 613 |
+
2025-03-02 15:47:15,360 [INFO] __main__ - Processing table image => img_3.jpg, columns=three
|
| 614 |
+
2025-03-02 15:47:18,359 [INFO] __main__ - Processing table image => img_4.jpg, columns=three
|
| 615 |
+
2025-03-02 15:47:21,998 [INFO] __main__ - Processing table image => img_5.jpg, columns=three
|
| 616 |
+
2025-03-02 15:47:24,942 [INFO] __main__ - Processing table image => img_6.jpg, columns=three
|
| 617 |
+
2025-03-02 15:47:28,526 [INFO] __main__ - Processing table image => img_7.jpg, columns=three
|
| 618 |
+
2025-03-02 15:47:31,741 [INFO] __main__ - Processing table image => img_8.jpg, columns=three
|
| 619 |
+
2025-03-02 15:47:35,515 [INFO] __main__ - Processing table image => img_9.jpg, columns=three
|
| 620 |
+
2025-03-02 15:47:39,076 [INFO] __main__ - Processing table image => img_10.jpg, columns=three
|
| 621 |
+
2025-03-02 15:47:42,855 [INFO] __main__ - Processing table image => img_11.jpg, columns=three
|
| 622 |
+
2025-03-02 15:47:45,035 [INFO] __main__ - Processing table image => img_12.jpg, columns=three
|
| 623 |
+
2025-03-02 15:47:48,119 [INFO] __main__ - Processing table image => img_13.jpg, columns=three
|
| 624 |
+
2025-03-02 15:47:51,846 [INFO] __main__ - Processing table image => img_14.jpg, columns=three
|
| 625 |
+
2025-03-02 15:47:55,356 [INFO] __main__ - Processing table image => img_15.jpg, columns=three
|
| 626 |
+
2025-03-02 15:47:58,009 [INFO] __main__ - Processing table image => img_16.jpg, columns=three
|
| 627 |
+
2025-03-02 15:48:02,296 [INFO] __main__ - Processing table image => img_17.jpg, columns=three
|
| 628 |
+
2025-03-02 15:48:05,441 [INFO] __main__ - Processing table image => img_18.jpg, columns=three
|
| 629 |
+
2025-03-02 15:48:08,620 [INFO] __main__ - Processing table image => img_19.jpg, columns=three
|
| 630 |
+
2025-03-02 15:48:12,128 [INFO] __main__ - Processing table image => img_20.jpg, columns=three
|
| 631 |
+
2025-03-02 15:48:15,972 [INFO] __main__ - Processing table image => img_21.jpg, columns=three
|
| 632 |
+
2025-03-02 15:48:19,326 [INFO] __main__ - Processing table image => img_22.jpg, columns=three
|
| 633 |
+
2025-03-02 15:48:21,707 [INFO] __main__ - Final JSON saved locally at /home/user/app/output/final_output.json
|
| 634 |
+
2025-03-02 15:48:21,707 [INFO] __main__ - Final JSON saved locally at /home/user/app/output/final_output_local.json
|
| 635 |
+
2025-03-02 15:48:22,008 [INFO] __main__ - GPU memory cleaned up.
|
| 636 |
+
2025-03-02 15:48:22,014 [INFO] __main__ - Processing completed successfully.
|
| 637 |
+
2025-03-02 15:51:36,990 [INFO] __main__ - Running in test mode: using local image writer.
|
| 638 |
+
2025-03-02 15:51:36,990 [INFO] __main__ - Processing PDF: /home/user/app/input_output/aqa-Mathematics-specification.pdf
|
| 639 |
+
2025-03-02 15:51:38,323 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 11], '1 Proof': [11, 11], '2 Algebra and \nfunctions': [12, 15], '3 Coordinate \ngeometry in \nthe (x,y) plane': [16, 17], '4 Sequences \nand series': [17, 18], '5 Trigonometry': [19, 20]}
|
| 640 |
+
2025-03-02 15:51:38,323 [INFO] __main__ - Loaded 888895 bytes from local file '/home/user/app/input_output/aqa-Mathematics-specification.pdf'
|
| 641 |
+
2025-03-02 15:51:38,484 [INFO] __main__ - Occurrences of subtopic 'Paper 1 and Paper 2: Pure Mathematics': [0]
|
| 642 |
+
2025-03-02 15:51:38,485 [WARNING] __main__ - No suitable occurrence for 'Paper 1 and Paper 2: Pure Mathematics'. Using page 0.
|
| 643 |
+
2025-03-02 15:51:38,612 [INFO] __main__ - Occurrences of subtopic '1 Proof': [0]
|
| 644 |
+
2025-03-02 15:51:38,612 [WARNING] __main__ - No suitable occurrence for '1 Proof'. Using page 0.
|
| 645 |
+
2025-03-02 15:51:38,737 [INFO] __main__ - Occurrences of subtopic '2 Algebra and
|
| 646 |
+
functions': [1, 2, 3, 4, 5]
|
| 647 |
+
2025-03-02 15:51:38,738 [WARNING] __main__ - No suitable occurrence for '2 Algebra and
|
| 648 |
+
functions'. Using page 5.
|
| 649 |
+
2025-03-02 15:51:38,873 [INFO] __main__ - Occurrences of subtopic '3 Coordinate
|
| 650 |
+
geometry in
|
| 651 |
+
the (x,y) plane': [5]
|
| 652 |
+
2025-03-02 15:51:38,874 [WARNING] __main__ - No suitable occurrence for '3 Coordinate
|
| 653 |
+
geometry in
|
| 654 |
+
the (x,y) plane'. Using page 5.
|
| 655 |
+
2025-03-02 15:51:39,032 [INFO] __main__ - Occurrences of subtopic '4 Sequences
|
| 656 |
+
and series': [6, 7]
|
| 657 |
+
2025-03-02 15:51:39,032 [WARNING] __main__ - No suitable occurrence for '4 Sequences
|
| 658 |
+
and series'. Using page 7.
|
| 659 |
+
2025-03-02 15:51:39,158 [INFO] __main__ - Occurrences of subtopic '5 Trigonometry': [8, 9]
|
| 660 |
+
2025-03-02 15:51:39,159 [WARNING] __main__ - No suitable occurrence for '5 Trigonometry'. Using page 9.
|
| 661 |
+
2025-03-02 15:51:39,159 [INFO] __main__ - Processing pages (0-based): [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
|
| 662 |
+
2025-03-02 15:52:09,390 [INFO] __main__ - doc_analyze complete. Extracting images.
|
| 663 |
+
2025-03-02 15:52:13,176 [INFO] __main__ - Classifying images to detect tables.
|
| 664 |
+
2025-03-02 15:52:22,888 [INFO] __main__ - Processing table image => img_1.jpg, columns=two
|
| 665 |
+
2025-03-02 15:52:25,632 [INFO] __main__ - Processing table image => img_2.jpg, columns=three
|
| 666 |
+
2025-03-02 15:52:28,498 [INFO] __main__ - Processing table image => img_3.jpg, columns=three
|
| 667 |
+
2025-03-02 15:52:31,350 [INFO] __main__ - Processing table image => img_4.jpg, columns=three
|
| 668 |
+
2025-03-02 15:52:35,021 [INFO] __main__ - Processing table image => img_5.jpg, columns=three
|
| 669 |
+
2025-03-02 15:52:38,559 [INFO] __main__ - Processing table image => img_6.jpg, columns=three
|
| 670 |
+
2025-03-02 15:52:42,236 [INFO] __main__ - Processing table image => img_7.jpg, columns=three
|
| 671 |
+
2025-03-02 15:52:45,343 [INFO] __main__ - Processing table image => img_8.jpg, columns=three
|
| 672 |
+
2025-03-02 15:52:46,233 [INFO] __main__ - Processing table image => img_9.jpg, columns=three
|
| 673 |
+
2025-03-02 15:52:48,663 [INFO] __main__ - Final JSON saved locally at /home/user/app/outpu_m/final_output.json
|
| 674 |
+
2025-03-02 15:52:48,663 [INFO] __main__ - Final JSON saved locally at /home/user/app/outpu_m/final_output_local.json
|
| 675 |
+
2025-03-02 15:52:48,989 [INFO] __main__ - GPU memory cleaned up.
|
| 676 |
+
2025-03-02 15:52:48,994 [INFO] __main__ - Processing completed successfully.
|
| 677 |
+
2025-03-02 15:55:31,322 [INFO] __main__ - Running in test mode: using local image writer.
|
| 678 |
+
2025-03-02 15:55:31,323 [INFO] __main__ - Processing PDF: /home/user/app/input_output/wjec-as-a-economics-specification-from-2015.pdf
|
| 679 |
+
2025-03-02 15:55:32,172 [INFO] __main__ - Gemini returned subtopics: {'2.1 AS units': [7, 22], '2.2 A2 units': [23, 43]}
|
| 680 |
+
2025-03-02 15:55:32,174 [INFO] __main__ - Loaded 3543551 bytes from local file '/home/user/app/input_output/wjec-as-a-economics-specification-from-2015.pdf'
|
| 681 |
+
2025-03-02 15:55:32,396 [INFO] __main__ - Occurrences of subtopic '2.1 AS units': []
|
| 682 |
+
2025-03-02 15:55:32,396 [WARNING] __main__ - No suitable occurrence for '2.1 AS units'. Using page 0.
|
| 683 |
+
2025-03-02 15:55:32,610 [INFO] __main__ - Occurrences of subtopic '2.2 A2 units': []
|
| 684 |
+
2025-03-02 15:55:32,610 [WARNING] __main__ - No suitable occurrence for '2.2 A2 units'. Using page 0.
|
| 685 |
+
2025-03-02 15:55:32,611 [INFO] __main__ - Processing pages (0-based): [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]
|
| 686 |
+
2025-03-02 15:55:50,803 [INFO] __main__ - GPU memory cleaned up.
|
| 687 |
+
2025-03-02 15:56:57,528 [INFO] __main__ - Running in test mode: using local image writer.
|
| 688 |
+
2025-03-02 15:56:57,529 [INFO] __main__ - Processing PDF: /home/user/app/input_output/wjec-gce-maths-spec-from-2017-e.pdf
|
| 689 |
+
2025-03-02 15:56:58,358 [INFO] __main__ - Gemini returned subtopics: {'2.1 AS Unit 1': [11, 17], '2.2 AS Unit 2': [18, 23], '2.3 A2 Unit 3': [24, 30], '2.4 A2 Unit 4': [31, 35]}
|
| 690 |
+
2025-03-02 15:56:58,359 [INFO] __main__ - Loaded 1510568 bytes from local file '/home/user/app/input_output/wjec-gce-maths-spec-from-2017-e.pdf'
|
| 691 |
+
2025-03-02 15:56:58,658 [INFO] __main__ - Occurrences of subtopic '2.1 AS Unit 1': [3]
|
| 692 |
+
2025-03-02 15:56:58,659 [WARNING] __main__ - No suitable occurrence for '2.1 AS Unit 1'. Using page 3.
|
| 693 |
+
2025-03-02 15:56:58,982 [INFO] __main__ - Occurrences of subtopic '2.2 AS Unit 2': [3]
|
| 694 |
+
2025-03-02 15:56:58,983 [WARNING] __main__ - No suitable occurrence for '2.2 AS Unit 2'. Using page 3.
|
| 695 |
+
2025-03-02 15:56:59,291 [INFO] __main__ - Occurrences of subtopic '2.3 A2 Unit 3': [3]
|
| 696 |
+
2025-03-02 15:56:59,291 [WARNING] __main__ - No suitable occurrence for '2.3 A2 Unit 3'. Using page 3.
|
| 697 |
+
2025-03-02 15:56:59,542 [INFO] __main__ - Occurrences of subtopic '2.4 A2 Unit 4': [3]
|
| 698 |
+
2025-03-02 15:56:59,542 [WARNING] __main__ - No suitable occurrence for '2.4 A2 Unit 4'. Using page 3.
|
| 699 |
+
2025-03-02 15:56:59,542 [INFO] __main__ - Processing pages (0-based): [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
|
| 700 |
+
2025-03-02 15:57:41,369 [INFO] __main__ - GPU memory cleaned up.
|
| 701 |
+
2025-03-02 15:57:41,370 [ERROR] __main__ - Processing failed: CUDA out of memory. Tried to allocate 756.00 MiB. GPU
|
| 702 |
+
2025-03-02 15:58:10,089 [INFO] __main__ - Running in test mode: using local image writer.
|
| 703 |
+
2025-03-02 15:58:10,089 [INFO] __main__ - Processing PDF: /home/user/app/input_output/wjec-gce-maths-spec-from-2017-e.pdf
|
| 704 |
+
2025-03-02 15:58:10,963 [INFO] __main__ - Gemini returned subtopics: {'2.1 AS Unit 1': [11, 17], '2.2 AS Unit 2': [18, 23], '2.3 A2 Unit 3': [24, 30], '2.4 A2 Unit 4': [31, 35]}
|
| 705 |
+
2025-03-02 15:58:10,964 [INFO] __main__ - Loaded 1510568 bytes from local file '/home/user/app/input_output/wjec-gce-maths-spec-from-2017-e.pdf'
|
| 706 |
+
2025-03-02 15:58:11,265 [INFO] __main__ - Occurrences of subtopic '2.1 AS Unit 1': [3]
|
| 707 |
+
2025-03-02 15:58:11,265 [WARNING] __main__ - No suitable occurrence for '2.1 AS Unit 1'. Using page 3.
|
| 708 |
+
2025-03-02 15:58:11,464 [INFO] __main__ - Occurrences of subtopic '2.2 AS Unit 2': [3]
|
| 709 |
+
2025-03-02 15:58:11,465 [WARNING] __main__ - No suitable occurrence for '2.2 AS Unit 2'. Using page 3.
|
| 710 |
+
2025-03-02 15:58:11,652 [INFO] __main__ - Occurrences of subtopic '2.3 A2 Unit 3': [3]
|
| 711 |
+
2025-03-02 15:58:11,652 [WARNING] __main__ - No suitable occurrence for '2.3 A2 Unit 3'. Using page 3.
|
| 712 |
+
2025-03-02 15:58:11,838 [INFO] __main__ - Occurrences of subtopic '2.4 A2 Unit 4': [3]
|
| 713 |
+
2025-03-02 15:58:11,838 [WARNING] __main__ - No suitable occurrence for '2.4 A2 Unit 4'. Using page 3.
|
| 714 |
+
2025-03-02 15:58:11,839 [INFO] __main__ - Processing pages (0-based): [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
|
| 715 |
+
2025-03-02 15:58:29,224 [INFO] __main__ - GPU memory cleaned up.
|
| 716 |
+
2025-03-02 15:58:29,225 [ERROR] __main__ - Processing failed: CUDA out of memory. Tried to allocate 148.00 MiB. GPU
|
| 717 |
+
2025-03-02 15:59:09,686 [INFO] __main__ - Running in test mode: using local image writer.
|
| 718 |
+
2025-03-02 15:59:09,687 [INFO] __main__ - Processing PDF: /home/user/app/input_output/wjec-gce-maths-spec-from-2017-e.pdf
|
| 719 |
+
2025-03-02 15:59:10,528 [INFO] __main__ - Gemini returned subtopics: {'2.1 AS Unit 1': [11, 17], '2.2 AS Unit 2': [18, 23], '2.3 A2 Unit 3': [24, 30], '2.4 A2 Unit 4': [31, 35]}
|
| 720 |
+
2025-03-02 15:59:10,529 [INFO] __main__ - Loaded 1510568 bytes from local file '/home/user/app/input_output/wjec-gce-maths-spec-from-2017-e.pdf'
|
| 721 |
+
2025-03-02 15:59:10,803 [INFO] __main__ - Occurrences of subtopic '2.1 AS Unit 1': [3]
|
| 722 |
+
2025-03-02 15:59:10,804 [WARNING] __main__ - No suitable occurrence for '2.1 AS Unit 1'. Using page 3.
|
| 723 |
+
2025-03-02 15:59:11,115 [INFO] __main__ - Occurrences of subtopic '2.2 AS Unit 2': [3]
|
| 724 |
+
2025-03-02 15:59:11,116 [WARNING] __main__ - No suitable occurrence for '2.2 AS Unit 2'. Using page 3.
|
| 725 |
+
2025-03-02 15:59:11,424 [INFO] __main__ - Occurrences of subtopic '2.3 A2 Unit 3': [3]
|
| 726 |
+
2025-03-02 15:59:11,424 [WARNING] __main__ - No suitable occurrence for '2.3 A2 Unit 3'. Using page 3.
|
| 727 |
+
2025-03-02 15:59:11,730 [INFO] __main__ - Occurrences of subtopic '2.4 A2 Unit 4': [3]
|
| 728 |
+
2025-03-02 15:59:11,730 [WARNING] __main__ - No suitable occurrence for '2.4 A2 Unit 4'. Using page 3.
|
| 729 |
+
2025-03-02 15:59:11,730 [INFO] __main__ - Processing pages (0-based): [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
|
| 730 |
+
2025-03-02 15:59:48,050 [INFO] __main__ - GPU memory cleaned up.
|
topic_extraction.py
CHANGED
|
@@ -77,14 +77,19 @@ class s3Writer:
|
|
| 77 |
)
|
| 78 |
|
| 79 |
def write(self, path: str, data: bytes) -> None:
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
file_obj
|
| 83 |
-
self.
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
-
#reduce img size, save time for gemini call
|
| 88 |
def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
|
| 89 |
arr = np.frombuffer(image_data, np.uint8)
|
| 90 |
img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
|
|
@@ -164,6 +169,7 @@ async def classify_image_async(image_data: bytes, api_key: str, max_retries: int
|
|
| 164 |
class S3ImageWriter(DataWriter):
|
| 165 |
def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
|
| 166 |
self.s3_writer = s3_writer
|
|
|
|
| 167 |
self.base_path = base_path if base_path.endswith("/") else base_path + "/"
|
| 168 |
self.gemini_api_key = gemini_api_key
|
| 169 |
self.descriptions = {}
|
|
@@ -207,20 +213,16 @@ class S3ImageWriter(DataWriter):
|
|
| 207 |
|
| 208 |
md_content = await self._process_table_images_in_markdown(key, md_content)
|
| 209 |
final_lines = []
|
| 210 |
-
|
| 211 |
for line in md_content.split("\n"):
|
| 212 |
if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
|
| 213 |
final_lines.append(line.strip())
|
| 214 |
-
|
| 215 |
return "\n".join(final_lines)
|
| 216 |
|
| 217 |
async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
|
| 218 |
pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
|
| 219 |
matches = re.findall(pat, md_content, flags=re.IGNORECASE)
|
| 220 |
-
|
| 221 |
if not matches:
|
| 222 |
return md_content
|
| 223 |
-
|
| 224 |
for (col_type, s3_key) in matches:
|
| 225 |
logger.info(f"Processing table image: {s3_key}, columns={col_type}")
|
| 226 |
img_data = None
|
|
@@ -250,19 +252,15 @@ class S3ImageWriter(DataWriter):
|
|
| 250 |
subtopic_threshold=0.2
|
| 251 |
)
|
| 252 |
row_boxes = extractor.process_image(temp_path)
|
| 253 |
-
|
| 254 |
snippet = ["**Extracted table cells:**"]
|
| 255 |
-
|
| 256 |
for i, row in enumerate(row_boxes):
|
| 257 |
for j, _ in enumerate(row):
|
| 258 |
cell_unique_key = f"{self.base_path}cells/{os.path.basename(s3_key).split('.')[0]}_row{i}_col{j}.jpg"
|
| 259 |
self.s3_writer.write(cell_unique_key, img_data)
|
| 260 |
snippet.append(f"")
|
| 261 |
new_snip = "\n".join(snippet)
|
| 262 |
-
|
| 263 |
old_line = f""
|
| 264 |
md_content = md_content.replace(old_line, new_snip)
|
| 265 |
-
|
| 266 |
except Exception as e:
|
| 267 |
logger.error(f"Error processing table image {s3_key}: {e}")
|
| 268 |
finally:
|
|
@@ -274,7 +272,7 @@ class S3ImageWriter(DataWriter):
|
|
| 274 |
|
| 275 |
def post_process(self, key: str, md_content: str) -> str:
|
| 276 |
return asyncio.run(self.post_process_async(key, md_content))
|
| 277 |
-
|
| 278 |
class LocalImageWriter(DataWriter):
|
| 279 |
def __init__(self, output_folder: str, gemini_api_key: str):
|
| 280 |
self.output_folder = output_folder
|
|
@@ -282,6 +280,8 @@ class LocalImageWriter(DataWriter):
|
|
| 282 |
self.descriptions = {}
|
| 283 |
self._img_count = 0
|
| 284 |
self.gemini_api_key = gemini_api_key
|
|
|
|
|
|
|
| 285 |
|
| 286 |
def write(self, path: str, data: bytes) -> None:
|
| 287 |
self._img_count += 1
|
|
@@ -292,13 +292,16 @@ class LocalImageWriter(DataWriter):
|
|
| 292 |
"table_classification": "NO_TABLE",
|
| 293 |
"final_alt": ""
|
| 294 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
|
| 296 |
async def post_process_async(self, key: str, md_content: str) -> str:
|
| 297 |
logger.info("Classifying images to detect tables.")
|
| 298 |
tasks = []
|
| 299 |
for p, info in self.descriptions.items():
|
| 300 |
tasks.append((p, classify_image_async(info["data"], self.gemini_api_key)))
|
| 301 |
-
|
| 302 |
for p, task in tasks:
|
| 303 |
try:
|
| 304 |
classification = await task
|
|
@@ -306,7 +309,6 @@ class LocalImageWriter(DataWriter):
|
|
| 306 |
except Exception as e:
|
| 307 |
logger.error(f"Table classification error: {e}")
|
| 308 |
self.descriptions[p]['table_classification'] = "NO_TABLE"
|
| 309 |
-
|
| 310 |
for p, info in self.descriptions.items():
|
| 311 |
cls = info['table_classification']
|
| 312 |
if cls == "TWO_COLUMN":
|
|
@@ -316,10 +318,8 @@ class LocalImageWriter(DataWriter):
|
|
| 316 |
else:
|
| 317 |
info['final_alt'] = "NO_TABLE image"
|
| 318 |
md_content = md_content.replace(f"", f"![{info['final_alt']}]({info['relative_path']})")
|
| 319 |
-
|
| 320 |
md_content = self._process_table_images_in_markdown(md_content)
|
| 321 |
final_lines = []
|
| 322 |
-
|
| 323 |
for line in md_content.split("\n"):
|
| 324 |
if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
|
| 325 |
final_lines.append(line.strip())
|
|
@@ -330,7 +330,6 @@ class LocalImageWriter(DataWriter):
|
|
| 330 |
matches = re.findall(pat, md_content, flags=re.IGNORECASE)
|
| 331 |
if not matches:
|
| 332 |
return md_content
|
| 333 |
-
|
| 334 |
for (col_type, image_id) in matches:
|
| 335 |
logger.info(f"Processing table image => {image_id}, columns={col_type}")
|
| 336 |
temp_path = os.path.join(self.output_folder, image_id)
|
|
@@ -361,12 +360,17 @@ class LocalImageWriter(DataWriter):
|
|
| 361 |
subtopic_threshold=0.2
|
| 362 |
)
|
| 363 |
row_boxes = extractor.process_image(temp_path)
|
| 364 |
-
|
| 365 |
out_folder = temp_path + "_rows"
|
| 366 |
os.makedirs(out_folder, exist_ok=True)
|
| 367 |
-
|
| 368 |
extractor.save_extracted_cells(temp_path, row_boxes, out_folder)
|
| 369 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
snippet = ["**Extracted table cells:**"]
|
| 371 |
for i, row in enumerate(row_boxes):
|
| 372 |
row_dir = os.path.join(out_folder, f"row_{i}")
|
|
@@ -378,7 +382,6 @@ class LocalImageWriter(DataWriter):
|
|
| 378 |
new_snip = "\n".join(snippet)
|
| 379 |
old_line = f""
|
| 380 |
md_content = md_content.replace(old_line, new_snip)
|
| 381 |
-
|
| 382 |
except Exception as e:
|
| 383 |
logger.error(f"Error processing table image {image_id}: {e}")
|
| 384 |
finally:
|
|
@@ -390,7 +393,7 @@ class LocalImageWriter(DataWriter):
|
|
| 390 |
return asyncio.run(self.post_process_async(key, md_content))
|
| 391 |
|
| 392 |
class GeminiTopicExtractor:
|
| 393 |
-
def __init__(self, api_key: str = None, num_pages: int =
|
| 394 |
self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
|
| 395 |
self.num_pages = num_pages
|
| 396 |
|
|
@@ -399,10 +402,8 @@ class GeminiTopicExtractor:
|
|
| 399 |
if not first_pages_text.strip():
|
| 400 |
logger.error("No text from first pages => cannot extract subtopics.")
|
| 401 |
return {}
|
| 402 |
-
|
| 403 |
prompt = f"""
|
| 404 |
You have the first pages of a PDF specification, including a table of contents.
|
| 405 |
-
|
| 406 |
Instructions:
|
| 407 |
1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
|
| 408 |
2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
|
|
@@ -413,15 +414,11 @@ Instructions:
|
|
| 413 |
"Subtopic B": [start_page, end_page]
|
| 414 |
}}
|
| 415 |
5. If you can't find any subtopics, return an empty JSON.
|
| 416 |
-
|
| 417 |
Important notes:
|
| 418 |
- The correct "end_page" must be the page number of the next topic or subtopic minus 1.
|
| 419 |
- The final output must be valid JSON only, with no extra text or code blocks.
|
| 420 |
-
|
| 421 |
Examples:
|
| 422 |
-
|
| 423 |
1. Given this table of contents:
|
| 424 |
-
|
| 425 |
1 Introduction – 2
|
| 426 |
Why choose Edexcel A Level Mathematics? - 2
|
| 427 |
Supporting you in planning and implementing this qualification - 3
|
|
@@ -442,16 +439,12 @@ Appendix 5: The context for the development of this qualification – 62
|
|
| 442 |
Appendix 6: Transferable skills – 64
|
| 443 |
Appendix 7: Level 3 Extended Project qualification – 65
|
| 444 |
Appendix 8: Codes – 67
|
| 445 |
-
|
| 446 |
The correct output should be:
|
| 447 |
-
|
| 448 |
{{
|
| 449 |
"Paper 1 and Paper 2: Pure Mathematics": [11, 29],
|
| 450 |
"Paper 3: Statistics and Mechanics": [30, 42]
|
| 451 |
}}
|
| 452 |
-
|
| 453 |
2. Given this table of contents:
|
| 454 |
-
|
| 455 |
Qualification at a glance – 1
|
| 456 |
Assessment Objectives and weightings - 4
|
| 457 |
Knowledge, skills and understanding – 5
|
|
@@ -480,34 +473,26 @@ Appendix 2: Level 3 Extended Project qualification – 55
|
|
| 480 |
Appendix 3: Quantitative skills – 59
|
| 481 |
Appendix 4: Codes – 61
|
| 482 |
Appendix 5: Index – 63
|
| 483 |
-
|
| 484 |
The correct output should be:
|
| 485 |
-
|
| 486 |
{{
|
| 487 |
"Theme 1: Introduction to markets and market failure": [5, 10],
|
| 488 |
"Theme 2: The UK economy – performance and policies": [11, 20],
|
| 489 |
"Theme 3: Business behaviour and the labour market": [21, 28],
|
| 490 |
"Theme 4: A global perspective": [29, 38]
|
| 491 |
}}
|
| 492 |
-
|
| 493 |
3. You might also see sections like:
|
| 494 |
-
|
| 495 |
2.1 AS Unit 1 11
|
| 496 |
2.2 AS Unit 2 18
|
| 497 |
2.3 A2 Unit 3 24
|
| 498 |
2.4 A2 Unit 4 31
|
| 499 |
-
|
| 500 |
In that scenario, your output might look like:
|
| 501 |
-
|
| 502 |
{{
|
| 503 |
"2.1 AS Unit 1": [11, 17],
|
| 504 |
"2.2 AS Unit 2": [18, 23],
|
| 505 |
"2.3 A2 Unit 3": [24, 30],
|
| 506 |
"2.4 A2 Unit 4": [31, 35]
|
| 507 |
}}
|
| 508 |
-
|
| 509 |
4. Another example might list subtopics:
|
| 510 |
-
|
| 511 |
3.1 Overarching themes 11
|
| 512 |
3.2 A: Proof 12
|
| 513 |
3.3 B: Algebra and functions 13
|
|
@@ -529,15 +514,12 @@ In that scenario, your output might look like:
|
|
| 529 |
3.19 R: Forces and Newton’s laws 24
|
| 530 |
3.20 S: Moments 25
|
| 531 |
3.21 Use of data in statistics 26
|
| 532 |
-
|
| 533 |
Here the correct output might look like:
|
| 534 |
-
|
| 535 |
{{
|
| 536 |
"A: Proof": [12, 12],
|
| 537 |
"B: Algebra and functions": [13, 13],
|
| 538 |
...
|
| 539 |
}}
|
| 540 |
-
|
| 541 |
Now, extract topics from this text:
|
| 542 |
{first_pages_text}
|
| 543 |
"""
|
|
@@ -545,7 +527,6 @@ Now, extract topics from this text:
|
|
| 545 |
if _GEMINI_CLIENT is None:
|
| 546 |
_GEMINI_CLIENT = genai.Client(api_key=self.api_key)
|
| 547 |
client = _GEMINI_CLIENT
|
| 548 |
-
|
| 549 |
try:
|
| 550 |
response = client.models.generate_content(
|
| 551 |
model="gemini-2.0-flash",
|
|
@@ -613,20 +594,22 @@ class MineruNoTextProcessor:
|
|
| 613 |
self.language = "en"
|
| 614 |
self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=10)
|
| 615 |
self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
|
| 616 |
-
|
| 617 |
-
if
|
| 618 |
-
|
| 619 |
-
self.use_s3 = True
|
| 620 |
-
self.s3_writer = s3Writer(
|
| 621 |
-
ak=os.getenv("S3_ACCESS_KEY"),
|
| 622 |
-
sk=os.getenv("S3_SECRET_KEY"),
|
| 623 |
-
bucket=os.getenv("S3_BUCKET_NAME"),
|
| 624 |
-
endpoint_url=os.getenv("S3_ENDPOINT")
|
| 625 |
-
)
|
| 626 |
-
self.base_path = "topic_extraction/"
|
| 627 |
-
else:
|
| 628 |
self.use_s3 = False
|
| 629 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 630 |
|
| 631 |
def cleanup_gpu(self):
|
| 632 |
try:
|
|
@@ -636,12 +619,11 @@ class MineruNoTextProcessor:
|
|
| 636 |
except Exception as e:
|
| 637 |
logger.error(f"Error during GPU cleanup: {e}")
|
| 638 |
|
| 639 |
-
def process(self, pdf_path: str) -> str:
|
| 640 |
logger.info(f"Processing PDF: {pdf_path}")
|
| 641 |
try:
|
| 642 |
subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
|
| 643 |
logger.info(f"Gemini returned subtopics: {subtopics}")
|
| 644 |
-
|
| 645 |
if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
|
| 646 |
response = requests.get(pdf_path)
|
| 647 |
if response.status_code != 200:
|
|
@@ -653,11 +635,9 @@ class MineruNoTextProcessor:
|
|
| 653 |
with open(pdf_path, "rb") as f:
|
| 654 |
pdf_bytes = f.read()
|
| 655 |
logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
|
| 656 |
-
|
| 657 |
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 658 |
total_pages = doc.page_count
|
| 659 |
doc.close()
|
| 660 |
-
|
| 661 |
final_pages = set()
|
| 662 |
if not subtopics:
|
| 663 |
logger.warning("No subtopics found. Processing entire PDF as fallback.")
|
|
@@ -694,7 +674,6 @@ class MineruNoTextProcessor:
|
|
| 694 |
logger.warning("No valid pages after offset. Processing entire PDF.")
|
| 695 |
final_pages = set(range(total_pages))
|
| 696 |
logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
|
| 697 |
-
|
| 698 |
subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
|
| 699 |
dataset = PymuDocDataset(subset_pdf_bytes)
|
| 700 |
inference = doc_analyze(
|
|
@@ -706,34 +685,48 @@ class MineruNoTextProcessor:
|
|
| 706 |
table_enable=self.table_enable
|
| 707 |
)
|
| 708 |
logger.info("doc_analyze complete. Extracting images.")
|
|
|
|
| 709 |
if self.use_s3:
|
| 710 |
-
writer = S3ImageWriter(self.s3_writer,
|
| 711 |
-
md_prefix =
|
| 712 |
else:
|
| 713 |
writer = LocalImageWriter(self.output_folder, self.gemini_api_key)
|
| 714 |
md_prefix = "local-unique-prefix/"
|
| 715 |
pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
|
| 716 |
md_content = pipe_result.get_markdown(md_prefix)
|
| 717 |
final_markdown = writer.post_process(md_prefix, md_content)
|
| 718 |
-
|
| 719 |
-
|
| 720 |
-
|
| 721 |
-
|
| 722 |
-
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 727 |
finally:
|
| 728 |
self.cleanup_gpu()
|
| 729 |
|
| 730 |
if __name__ == "__main__":
|
| 731 |
input_pdf = "/home/user/app/input_output/wjec-gce-maths-spec-from-2017-e.pdf"
|
| 732 |
-
output_dir = "/home/user/app/
|
| 733 |
gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
|
| 734 |
try:
|
| 735 |
processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
|
| 736 |
-
|
| 737 |
logger.info("Processing completed successfully.")
|
| 738 |
except Exception as e:
|
| 739 |
logger.error(f"Processing failed: {e}")
|
|
|
|
| 77 |
)
|
| 78 |
|
| 79 |
def write(self, path: str, data: bytes) -> None:
|
| 80 |
+
"""Upload data to S3 using proper keyword arguments"""
|
| 81 |
+
try:
|
| 82 |
+
file_obj = BytesIO(data)
|
| 83 |
+
self.client.upload_fileobj(
|
| 84 |
+
file_obj,
|
| 85 |
+
self.bucket,
|
| 86 |
+
path
|
| 87 |
+
)
|
| 88 |
+
logger.info(f"Uploaded to S3: {path}")
|
| 89 |
+
except Exception as e:
|
| 90 |
+
logger.error(f"Failed to upload to S3: {str(e)}")
|
| 91 |
+
raise
|
| 92 |
|
|
|
|
| 93 |
def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
|
| 94 |
arr = np.frombuffer(image_data, np.uint8)
|
| 95 |
img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
|
|
|
|
| 169 |
class S3ImageWriter(DataWriter):
|
| 170 |
def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
|
| 171 |
self.s3_writer = s3_writer
|
| 172 |
+
# Use the provided base_path (which can be based on the PDF file name)
|
| 173 |
self.base_path = base_path if base_path.endswith("/") else base_path + "/"
|
| 174 |
self.gemini_api_key = gemini_api_key
|
| 175 |
self.descriptions = {}
|
|
|
|
| 213 |
|
| 214 |
md_content = await self._process_table_images_in_markdown(key, md_content)
|
| 215 |
final_lines = []
|
|
|
|
| 216 |
for line in md_content.split("\n"):
|
| 217 |
if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
|
| 218 |
final_lines.append(line.strip())
|
|
|
|
| 219 |
return "\n".join(final_lines)
|
| 220 |
|
| 221 |
async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
|
| 222 |
pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
|
| 223 |
matches = re.findall(pat, md_content, flags=re.IGNORECASE)
|
|
|
|
| 224 |
if not matches:
|
| 225 |
return md_content
|
|
|
|
| 226 |
for (col_type, s3_key) in matches:
|
| 227 |
logger.info(f"Processing table image: {s3_key}, columns={col_type}")
|
| 228 |
img_data = None
|
|
|
|
| 252 |
subtopic_threshold=0.2
|
| 253 |
)
|
| 254 |
row_boxes = extractor.process_image(temp_path)
|
|
|
|
| 255 |
snippet = ["**Extracted table cells:**"]
|
|
|
|
| 256 |
for i, row in enumerate(row_boxes):
|
| 257 |
for j, _ in enumerate(row):
|
| 258 |
cell_unique_key = f"{self.base_path}cells/{os.path.basename(s3_key).split('.')[0]}_row{i}_col{j}.jpg"
|
| 259 |
self.s3_writer.write(cell_unique_key, img_data)
|
| 260 |
snippet.append(f"")
|
| 261 |
new_snip = "\n".join(snippet)
|
|
|
|
| 262 |
old_line = f""
|
| 263 |
md_content = md_content.replace(old_line, new_snip)
|
|
|
|
| 264 |
except Exception as e:
|
| 265 |
logger.error(f"Error processing table image {s3_key}: {e}")
|
| 266 |
finally:
|
|
|
|
| 272 |
|
| 273 |
def post_process(self, key: str, md_content: str) -> str:
    """Synchronous wrapper around ``post_process_async``.

    Builds the coroutine for the given ``key`` and ``md_content`` and runs
    it to completion with ``asyncio.run``, returning whatever the
    coroutine returns.
    """
    pending = self.post_process_async(key, md_content)
    return asyncio.run(pending)
|
| 275 |
+
|
| 276 |
class LocalImageWriter(DataWriter):
|
| 277 |
def __init__(self, output_folder: str, gemini_api_key: str):
|
| 278 |
self.output_folder = output_folder
|
|
|
|
| 280 |
self.descriptions = {}
|
| 281 |
self._img_count = 0
|
| 282 |
self.gemini_api_key = gemini_api_key
|
| 283 |
+
# New mapping to store extracted table cell image paths for testing.
|
| 284 |
+
self.extracted_tables = {}
|
| 285 |
|
| 286 |
def write(self, path: str, data: bytes) -> None:
|
| 287 |
self._img_count += 1
|
|
|
|
| 292 |
"table_classification": "NO_TABLE",
|
| 293 |
"final_alt": ""
|
| 294 |
}
|
| 295 |
+
# Also save the original image locally for testing.
|
| 296 |
+
image_path = os.path.join(self.output_folder, unique_id)
|
| 297 |
+
with open(image_path, "wb") as f:
|
| 298 |
+
f.write(data)
|
| 299 |
|
| 300 |
async def post_process_async(self, key: str, md_content: str) -> str:
|
| 301 |
logger.info("Classifying images to detect tables.")
|
| 302 |
tasks = []
|
| 303 |
for p, info in self.descriptions.items():
|
| 304 |
tasks.append((p, classify_image_async(info["data"], self.gemini_api_key)))
|
|
|
|
| 305 |
for p, task in tasks:
|
| 306 |
try:
|
| 307 |
classification = await task
|
|
|
|
| 309 |
except Exception as e:
|
| 310 |
logger.error(f"Table classification error: {e}")
|
| 311 |
self.descriptions[p]['table_classification'] = "NO_TABLE"
|
|
|
|
| 312 |
for p, info in self.descriptions.items():
|
| 313 |
cls = info['table_classification']
|
| 314 |
if cls == "TWO_COLUMN":
|
|
|
|
| 318 |
else:
|
| 319 |
info['final_alt'] = "NO_TABLE image"
|
| 320 |
md_content = md_content.replace(f"", f"![{info['final_alt']}]({info['relative_path']})")
|
|
|
|
| 321 |
md_content = self._process_table_images_in_markdown(md_content)
|
| 322 |
final_lines = []
|
|
|
|
| 323 |
for line in md_content.split("\n"):
|
| 324 |
if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
|
| 325 |
final_lines.append(line.strip())
|
|
|
|
| 330 |
matches = re.findall(pat, md_content, flags=re.IGNORECASE)
|
| 331 |
if not matches:
|
| 332 |
return md_content
|
|
|
|
| 333 |
for (col_type, image_id) in matches:
|
| 334 |
logger.info(f"Processing table image => {image_id}, columns={col_type}")
|
| 335 |
temp_path = os.path.join(self.output_folder, image_id)
|
|
|
|
| 360 |
subtopic_threshold=0.2
|
| 361 |
)
|
| 362 |
row_boxes = extractor.process_image(temp_path)
|
|
|
|
| 363 |
out_folder = temp_path + "_rows"
|
| 364 |
os.makedirs(out_folder, exist_ok=True)
|
|
|
|
| 365 |
extractor.save_extracted_cells(temp_path, row_boxes, out_folder)
|
| 366 |
+
# List all extracted cell images relative to the output folder.
|
| 367 |
+
extracted_cells = []
|
| 368 |
+
for root, dirs, files in os.walk(out_folder):
|
| 369 |
+
for file in files:
|
| 370 |
+
rel_path = os.path.relpath(os.path.join(root, file), self.output_folder)
|
| 371 |
+
extracted_cells.append(rel_path)
|
| 372 |
+
# Save mapping for testing.
|
| 373 |
+
self.extracted_tables[image_id] = extracted_cells
|
| 374 |
snippet = ["**Extracted table cells:**"]
|
| 375 |
for i, row in enumerate(row_boxes):
|
| 376 |
row_dir = os.path.join(out_folder, f"row_{i}")
|
|
|
|
| 382 |
new_snip = "\n".join(snippet)
|
| 383 |
old_line = f""
|
| 384 |
md_content = md_content.replace(old_line, new_snip)
|
|
|
|
| 385 |
except Exception as e:
|
| 386 |
logger.error(f"Error processing table image {image_id}: {e}")
|
| 387 |
finally:
|
|
|
|
| 393 |
return asyncio.run(self.post_process_async(key, md_content))
|
| 394 |
|
| 395 |
class GeminiTopicExtractor:
|
| 396 |
+
def __init__(self, api_key: str = None, num_pages: int = 14):
|
| 397 |
self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
|
| 398 |
self.num_pages = num_pages
|
| 399 |
|
|
|
|
| 402 |
if not first_pages_text.strip():
|
| 403 |
logger.error("No text from first pages => cannot extract subtopics.")
|
| 404 |
return {}
|
|
|
|
| 405 |
prompt = f"""
|
| 406 |
You have the first pages of a PDF specification, including a table of contents.
|
|
|
|
| 407 |
Instructions:
|
| 408 |
1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
|
| 409 |
2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
|
|
|
|
| 414 |
"Subtopic B": [start_page, end_page]
|
| 415 |
}}
|
| 416 |
5. If you can't find any subtopics, return an empty JSON.
|
|
|
|
| 417 |
Important notes:
|
| 418 |
- The correct "end_page" must be the page number of the next topic or subtopic minus 1.
|
| 419 |
- The final output must be valid JSON only, with no extra text or code blocks.
|
|
|
|
| 420 |
Examples:
|
|
|
|
| 421 |
1. Given this table of contents:
|
|
|
|
| 422 |
1 Introduction – 2
|
| 423 |
Why choose Edexcel A Level Mathematics? - 2
|
| 424 |
Supporting you in planning and implementing this qualification - 3
|
|
|
|
| 439 |
Appendix 6: Transferable skills – 64
|
| 440 |
Appendix 7: Level 3 Extended Project qualification – 65
|
| 441 |
Appendix 8: Codes – 67
|
|
|
|
| 442 |
The correct output should be:
|
|
|
|
| 443 |
{{
|
| 444 |
"Paper 1 and Paper 2: Pure Mathematics": [11, 29],
|
| 445 |
"Paper 3: Statistics and Mechanics": [30, 42]
|
| 446 |
}}
|
|
|
|
| 447 |
2. Given this table of contents:
|
|
|
|
| 448 |
Qualification at a glance – 1
|
| 449 |
Assessment Objectives and weightings - 4
|
| 450 |
Knowledge, skills and understanding – 5
|
|
|
|
| 473 |
Appendix 3: Quantitative skills – 59
|
| 474 |
Appendix 4: Codes – 61
|
| 475 |
Appendix 5: Index – 63
|
|
|
|
| 476 |
The correct output should be:
|
|
|
|
| 477 |
{{
|
| 478 |
"Theme 1: Introduction to markets and market failure": [5, 10],
|
| 479 |
"Theme 2: The UK economy – performance and policies": [11, 20],
|
| 480 |
"Theme 3: Business behaviour and the labour market": [21, 28],
|
| 481 |
"Theme 4: A global perspective": [29, 38]
|
| 482 |
}}
|
|
|
|
| 483 |
3. You might also see sections like:
|
|
|
|
| 484 |
2.1 AS Unit 1 11
|
| 485 |
2.2 AS Unit 2 18
|
| 486 |
2.3 A2 Unit 3 24
|
| 487 |
2.4 A2 Unit 4 31
|
|
|
|
| 488 |
In that scenario, your output might look like:
|
|
|
|
| 489 |
{{
|
| 490 |
"2.1 AS Unit 1": [11, 17],
|
| 491 |
"2.2 AS Unit 2": [18, 23],
|
| 492 |
"2.3 A2 Unit 3": [24, 30],
|
| 493 |
"2.4 A2 Unit 4": [31, 35]
|
| 494 |
}}
|
|
|
|
| 495 |
4. Another example might list subtopics:
|
|
|
|
| 496 |
3.1 Overarching themes 11
|
| 497 |
3.2 A: Proof 12
|
| 498 |
3.3 B: Algebra and functions 13
|
|
|
|
| 514 |
3.19 R: Forces and Newton’s laws 24
|
| 515 |
3.20 S: Moments 25
|
| 516 |
3.21 Use of data in statistics 26
|
|
|
|
| 517 |
Here the correct output might look like:
|
|
|
|
| 518 |
{{
|
| 519 |
"A: Proof": [12, 12],
|
| 520 |
"B: Algebra and functions": [13, 13],
|
| 521 |
...
|
| 522 |
}}
|
|
|
|
| 523 |
Now, extract topics from this text:
|
| 524 |
{first_pages_text}
|
| 525 |
"""
|
|
|
|
| 527 |
if _GEMINI_CLIENT is None:
|
| 528 |
_GEMINI_CLIENT = genai.Client(api_key=self.api_key)
|
| 529 |
client = _GEMINI_CLIENT
|
|
|
|
| 530 |
try:
|
| 531 |
response = client.models.generate_content(
|
| 532 |
model="gemini-2.0-flash",
|
|
|
|
| 594 |
self.language = "en"
|
| 595 |
self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=10)
|
| 596 |
self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
|
| 597 |
+
# For testing via __main__, force local saving.
|
| 598 |
+
if __name__ == "__main__":
|
| 599 |
+
logger.info("Running in test mode: using local image writer.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 600 |
self.use_s3 = False
|
| 601 |
+
else:
|
| 602 |
+
if (os.getenv("S3_ACCESS_KEY") and os.getenv("S3_SECRET_KEY") and
|
| 603 |
+
os.getenv("S3_BUCKET_NAME") and os.getenv("S3_ENDPOINT")):
|
| 604 |
+
self.use_s3 = True
|
| 605 |
+
self.s3_writer = s3Writer(
|
| 606 |
+
ak=os.getenv("S3_ACCESS_KEY"),
|
| 607 |
+
sk=os.getenv("S3_SECRET_KEY"),
|
| 608 |
+
bucket=os.getenv("S3_BUCKET_NAME"),
|
| 609 |
+
endpoint_url=os.getenv("S3_ENDPOINT")
|
| 610 |
+
)
|
| 611 |
+
else:
|
| 612 |
+
self.use_s3 = False
|
| 613 |
|
| 614 |
def cleanup_gpu(self):
|
| 615 |
try:
|
|
|
|
| 619 |
except Exception as e:
|
| 620 |
logger.error(f"Error during GPU cleanup: {e}")
|
| 621 |
|
| 622 |
+
def process(self, pdf_path: str) -> Dict[str, Any]:
|
| 623 |
logger.info(f"Processing PDF: {pdf_path}")
|
| 624 |
try:
|
| 625 |
subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
|
| 626 |
logger.info(f"Gemini returned subtopics: {subtopics}")
|
|
|
|
| 627 |
if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
|
| 628 |
response = requests.get(pdf_path)
|
| 629 |
if response.status_code != 200:
|
|
|
|
| 635 |
with open(pdf_path, "rb") as f:
|
| 636 |
pdf_bytes = f.read()
|
| 637 |
logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
|
|
|
|
| 638 |
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 639 |
total_pages = doc.page_count
|
| 640 |
doc.close()
|
|
|
|
| 641 |
final_pages = set()
|
| 642 |
if not subtopics:
|
| 643 |
logger.warning("No subtopics found. Processing entire PDF as fallback.")
|
|
|
|
| 674 |
logger.warning("No valid pages after offset. Processing entire PDF.")
|
| 675 |
final_pages = set(range(total_pages))
|
| 676 |
logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
|
|
|
|
| 677 |
subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
|
| 678 |
dataset = PymuDocDataset(subset_pdf_bytes)
|
| 679 |
inference = doc_analyze(
|
|
|
|
| 685 |
table_enable=self.table_enable
|
| 686 |
)
|
| 687 |
logger.info("doc_analyze complete. Extracting images.")
|
| 688 |
+
key = os.path.splitext(os.path.basename(pdf_path))[0]
|
| 689 |
if self.use_s3:
|
| 690 |
+
writer = S3ImageWriter(self.s3_writer, f"{key}/", self.gemini_api_key)
|
| 691 |
+
md_prefix = f"{key}/"
|
| 692 |
else:
|
| 693 |
writer = LocalImageWriter(self.output_folder, self.gemini_api_key)
|
| 694 |
md_prefix = "local-unique-prefix/"
|
| 695 |
pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
|
| 696 |
md_content = pipe_result.get_markdown(md_prefix)
|
| 697 |
final_markdown = writer.post_process(md_prefix, md_content)
|
| 698 |
+
|
| 699 |
+
output_json = {
|
| 700 |
+
"subtopics": subtopics
|
| 701 |
+
}
|
| 702 |
+
if not self.use_s3 and isinstance(writer, LocalImageWriter):
|
| 703 |
+
local_images = {k: v["relative_path"] for k, v in writer.descriptions.items()}
|
| 704 |
+
tables_extracted = writer.extracted_tables
|
| 705 |
+
output_json["local_images"] = local_images
|
| 706 |
+
output_json["tables_extracted"] = tables_extracted
|
| 707 |
+
# Save output in JSON format.
|
| 708 |
+
out_json = json.dumps(output_json, indent=2)
|
| 709 |
+
# Save JSON locally.
|
| 710 |
+
out_path = os.path.join(self.output_folder, "final_output.json")
|
| 711 |
+
with open(out_path, "w", encoding="utf-8") as f:
|
| 712 |
+
f.write(out_json)
|
| 713 |
+
logger.info(f"Final JSON saved locally at {out_path}")
|
| 714 |
+
# Also save a local copy for testing.
|
| 715 |
+
local_md_path = os.path.join(self.output_folder, "final_output_local.json")
|
| 716 |
+
with open(local_md_path, "w", encoding="utf-8") as f:
|
| 717 |
+
f.write(out_json)
|
| 718 |
+
logger.info(f"Final JSON saved locally at {local_md_path}")
|
| 719 |
+
return output_json
|
| 720 |
finally:
|
| 721 |
self.cleanup_gpu()
|
| 722 |
|
| 723 |
if __name__ == "__main__":
    # Manual smoke test: run the whole pipeline on one sample specification
    # PDF and write the results under output_dir.
    input_pdf = "/home/user/app/input_output/wjec-gce-maths-spec-from-2017-e.pdf"
    output_dir = "/home/user/app/wje"
    # The key must come from the environment. A literal key used as a
    # fallback ends up in version control and has to be treated as leaked.
    gemini_key = os.getenv("GEMINI_API_KEY", "")
    if not gemini_key:
        logger.error("GEMINI_API_KEY is not set; cannot run topic extraction.")
        raise SystemExit(1)
    try:
        processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
        result_json = processor.process(input_pdf)
        logger.info("Processing completed successfully.")
    except Exception as e:
        # Top-level boundary: record the traceback along with the message.
        logger.exception(f"Processing failed: {e}")
|