fix topic extraction prompt
Browse files
- topic_extraction.log +89 -0
- topic_extraction.py +8 -5
topic_extraction.log
CHANGED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-02-28 15:26:46,260 [INFO] __main__ - Processing PDF: /home/user/app/input_output/AQA-7357-SP-2017.PDF
|
| 2 |
+
2025-02-28 15:26:46,367 [ERROR] __main__ - Gemini subtopic extraction error: 'NoneType' object has no attribute 'models'
|
| 3 |
+
2025-02-28 15:26:46,368 [INFO] __main__ - Gemini returned subtopics: {}
|
| 4 |
+
2025-02-28 15:26:46,368 [INFO] __main__ - Loaded 1035984 bytes from local file '/home/user/app/input_output/AQA-7357-SP-2017.PDF'
|
| 5 |
+
2025-02-28 15:26:46,374 [WARNING] __main__ - No subtopics found. Processing entire PDF as fallback.
|
| 6 |
+
2025-02-28 15:26:46,375 [INFO] __main__ - Processing pages (0-based): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
|
| 7 |
+
2025-02-28 15:26:47,737 [INFO] __main__ - GPU memory cleaned up.
|
| 8 |
+
2025-02-28 15:27:16,530 [INFO] __main__ - Processing PDF: /home/user/app/input_output/ocr-specification-economics.pdf
|
| 9 |
+
2025-02-28 15:27:16,579 [ERROR] __main__ - Gemini subtopic extraction error: 'NoneType' object has no attribute 'models'
|
| 10 |
+
2025-02-28 15:27:16,579 [INFO] __main__ - Gemini returned subtopics: {}
|
| 11 |
+
2025-02-28 15:27:16,585 [INFO] __main__ - Loaded 9752567 bytes from local file '/home/user/app/input_output/ocr-specification-economics.pdf'
|
| 12 |
+
2025-02-28 15:27:16,588 [WARNING] __main__ - No subtopics found. Processing entire PDF as fallback.
|
| 13 |
+
2025-02-28 15:27:16,588 [INFO] __main__ - Processing pages (0-based): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
|
| 14 |
+
2025-02-28 15:27:17,564 [INFO] __main__ - GPU memory cleaned up.
|
| 15 |
+
2025-02-28 15:31:39,420 [INFO] __main__ - Processing PDF: /home/user/app/input_output/AQA-7357-SP-2017.PDF
|
| 16 |
+
2025-02-28 15:31:41,827 [INFO] __main__ - Gemini returned subtopics: {'A: Proof': [12, 12], 'B: Algebra and functions': [13, 13], 'C: Coordinate geometry in the ( x , y ) plane': [14, 14], 'D: Sequences and series': [15, 15], 'E: Trigonometry': [16, 16], 'F: Exponentials and logarithms': [17, 17], 'G: Differentiation': [18, 18], 'H: Integration': [19, 19], 'I: Numerical methods': [20, 20], 'J: Vectors': [20, 20], 'K: Statistical sampling': [21, 21], 'L: Data presentation and interpretation': [21, 21], 'M: Probability': [22, 22], 'N: Statistical distributions': [23, 22], 'O: Statistical hypothesis testing': [23, 23], 'P: Quantities and units in mechanics': [24, 23], 'Q: Kinematics': [24, 23], 'R: Forces and Newton’s laws': [24, 24], 'S: Moments': [25, 25]}
|
| 17 |
+
2025-02-28 15:31:41,828 [INFO] __main__ - Loaded 1035984 bytes from local file '/home/user/app/input_output/AQA-7357-SP-2017.PDF'
|
| 18 |
+
2025-02-28 15:31:41,986 [INFO] __main__ - Occurrences of subtopic 'A: Proof': [2, 6, 7, 11]
|
| 19 |
+
2025-02-28 15:31:42,138 [INFO] __main__ - Occurrences of subtopic 'B: Algebra and functions': [2, 6, 7, 12]
|
| 20 |
+
2025-02-28 15:31:42,297 [INFO] __main__ - Occurrences of subtopic 'C: Coordinate geometry in the ( x , y ) plane': [2, 6, 13]
|
| 21 |
+
2025-02-28 15:31:42,445 [INFO] __main__ - Occurrences of subtopic 'D: Sequences and series': [2, 6, 7, 14]
|
| 22 |
+
2025-02-28 15:31:42,594 [INFO] __main__ - Occurrences of subtopic 'E: Trigonometry': [2, 6, 7, 15]
|
| 23 |
+
2025-02-28 15:31:42,788 [INFO] __main__ - Occurrences of subtopic 'F: Exponentials and logarithms': [2, 6, 7, 16]
|
| 24 |
+
2025-02-28 15:31:42,936 [INFO] __main__ - Occurrences of subtopic 'G: Differentiation': [2, 6, 7, 17]
|
| 25 |
+
2025-02-28 15:31:43,087 [INFO] __main__ - Occurrences of subtopic 'H: Integration': [2, 6, 7, 18]
|
| 26 |
+
2025-02-28 15:31:43,238 [INFO] __main__ - Occurrences of subtopic 'I: Numerical methods': [2, 6, 7, 19]
|
| 27 |
+
2025-02-28 15:31:43,558 [INFO] __main__ - GPU memory cleaned up.
|
| 28 |
+
2025-02-28 15:32:03,728 [INFO] __main__ - Processing PDF: /home/user/app/input_output/AQA-7357-SP-2017.PDF
|
| 29 |
+
2025-02-28 15:32:06,129 [INFO] __main__ - Gemini returned subtopics: {'A: Proof': [12, 12], 'B: Algebra and functions': [13, 13], 'C: Coordinate geometry in the ( x , y ) plane': [14, 14], 'D: Sequences and series': [15, 15], 'E: Trigonometry': [16, 16], 'F: Exponentials and logarithms': [17, 17], 'G: Differentiation': [18, 18], 'H: Integration': [19, 19], 'I: Numerical methods': [20, 19], 'J: Vectors': [20, 20], 'K: Statistical sampling': [21, 20], 'L: Data presentation and interpretation': [21, 21], 'M: Probability': [22, 22], 'N: Statistical distributions': [23, 22], 'O: Statistical hypothesis testing': [23, 23], 'P: Quantities and units in mechanics': [24, 23], 'Q: Kinematics': [24, 23], 'R: Forces and Newton’s laws': [24, 24], 'S: Moments': [25, 25]}
|
| 30 |
+
2025-02-28 15:32:06,129 [INFO] __main__ - Loaded 1035984 bytes from local file '/home/user/app/input_output/AQA-7357-SP-2017.PDF'
|
| 31 |
+
2025-02-28 15:32:06,284 [INFO] __main__ - Occurrences of subtopic 'A: Proof': [2, 6, 7, 11]
|
| 32 |
+
2025-02-28 15:32:06,432 [INFO] __main__ - Occurrences of subtopic 'B: Algebra and functions': [2, 6, 7, 12]
|
| 33 |
+
2025-02-28 15:32:06,583 [INFO] __main__ - Occurrences of subtopic 'C: Coordinate geometry in the ( x , y ) plane': [2, 6, 13]
|
| 34 |
+
2025-02-28 15:32:06,732 [INFO] __main__ - Occurrences of subtopic 'D: Sequences and series': [2, 6, 7, 14]
|
| 35 |
+
2025-02-28 15:32:06,882 [INFO] __main__ - Occurrences of subtopic 'E: Trigonometry': [2, 6, 7, 15]
|
| 36 |
+
2025-02-28 15:32:07,034 [INFO] __main__ - Occurrences of subtopic 'F: Exponentials and logarithms': [2, 6, 7, 16]
|
| 37 |
+
2025-02-28 15:32:07,184 [INFO] __main__ - Occurrences of subtopic 'G: Differentiation': [2, 6, 7, 17]
|
| 38 |
+
2025-02-28 15:32:07,332 [INFO] __main__ - Occurrences of subtopic 'H: Integration': [2, 6, 7, 18]
|
| 39 |
+
2025-02-28 15:32:07,333 [WARNING] __main__ - Skipping subtopic 'I: Numerical methods' => start > end [20, 19]
|
| 40 |
+
2025-02-28 15:32:07,481 [INFO] __main__ - Occurrences of subtopic 'J: Vectors': [2, 6, 7, 19]
|
| 41 |
+
2025-02-28 15:32:07,481 [WARNING] __main__ - Skipping subtopic 'K: Statistical sampling' => start > end [21, 20]
|
| 42 |
+
2025-02-28 15:32:07,632 [INFO] __main__ - Occurrences of subtopic 'L: Data presentation and interpretation': [2, 6, 8, 20]
|
| 43 |
+
2025-02-28 15:32:07,780 [INFO] __main__ - Occurrences of subtopic 'M: Probability': [2, 6, 8, 21]
|
| 44 |
+
2025-02-28 15:32:07,781 [WARNING] __main__ - Skipping subtopic 'N: Statistical distributions' => start > end [23, 22]
|
| 45 |
+
2025-02-28 15:32:07,971 [INFO] __main__ - Occurrences of subtopic 'O: Statistical hypothesis testing': [2, 6, 8, 22]
|
| 46 |
+
2025-02-28 15:32:07,971 [WARNING] __main__ - Skipping subtopic 'P: Quantities and units in mechanics' => start > end [24, 23]
|
| 47 |
+
2025-02-28 15:32:07,971 [WARNING] __main__ - Skipping subtopic 'Q: Kinematics' => start > end [24, 23]
|
| 48 |
+
2025-02-28 15:32:08,132 [INFO] __main__ - Occurrences of subtopic 'R: Forces and Newton’s laws': [2, 6, 7, 23]
|
| 49 |
+
2025-02-28 15:32:08,285 [INFO] __main__ - Occurrences of subtopic 'S: Moments': [2, 6, 7, 24]
|
| 50 |
+
2025-02-28 15:32:08,285 [INFO] __main__ - Processing pages (0-based): [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
|
| 51 |
+
2025-02-28 15:32:12,066 [INFO] __main__ - GPU memory cleaned up.
|
| 52 |
+
2025-02-28 15:33:29,098 [INFO] __main__ - Processing PDF: /home/user/app/input_output/pearson-as-mathematics-specification.pdf
|
| 53 |
+
2025-02-28 15:33:29,851 [INFO] __main__ - Gemini returned subtopics: {'Paper 1: Pure Mathematics': [11, 19], 'Paper 2: Statistics and Mechanics': [20, 27]}
|
| 54 |
+
2025-02-28 15:33:29,851 [INFO] __main__ - Loaded 1039025 bytes from local file '/home/user/app/input_output/pearson-as-mathematics-specification.pdf'
|
| 55 |
+
2025-02-28 15:33:30,023 [INFO] __main__ - Occurrences of subtopic 'Paper 1: Pure Mathematics': [4, 8, 14, 30]
|
| 56 |
+
2025-02-28 15:33:30,184 [INFO] __main__ - Occurrences of subtopic 'Paper 2: Statistics and Mechanics': [4, 9, 13, 23, 30]
|
| 57 |
+
2025-02-28 15:33:30,184 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
|
| 58 |
+
2025-02-28 15:33:33,670 [INFO] __main__ - GPU memory cleaned up.
|
| 59 |
+
2025-02-28 15:34:00,961 [INFO] __main__ - Processing PDF: /home/user/app/input_output/wjec-gce-maths-spec-from-2017-e.pdf
|
| 60 |
+
2025-02-28 15:34:01,772 [INFO] __main__ - Gemini returned subtopics: {'AS Unit 1': [11, 17], 'AS Unit 2': [18, 23], 'A2 Unit 3': [24, 30], 'A2 Unit 4': [31, 35]}
|
| 61 |
+
2025-02-28 15:34:01,773 [INFO] __main__ - Loaded 1510568 bytes from local file '/home/user/app/input_output/wjec-gce-maths-spec-from-2017-e.pdf'
|
| 62 |
+
2025-02-28 15:34:01,961 [INFO] __main__ - Occurrences of subtopic 'AS Unit 1': [3, 4, 38, 40]
|
| 63 |
+
2025-02-28 15:34:02,141 [INFO] __main__ - Occurrences of subtopic 'AS Unit 2': [3, 4, 38, 40]
|
| 64 |
+
2025-02-28 15:34:02,320 [INFO] __main__ - Occurrences of subtopic 'A2 Unit 3': [3, 5, 38, 40]
|
| 65 |
+
2025-02-28 15:34:02,497 [INFO] __main__ - Occurrences of subtopic 'A2 Unit 4': [3, 5, 38, 40]
|
| 66 |
+
2025-02-28 15:34:02,497 [INFO] __main__ - Processing pages (0-based): [38, 39, 40, 41, 42, 43, 44]
|
| 67 |
+
2025-02-28 15:34:03,341 [INFO] __main__ - GPU memory cleaned up.
|
| 68 |
+
2025-02-28 15:34:26,135 [INFO] __main__ - Processing PDF: /home/user/app/input_output/wjec-gce-maths-spec-from-2017-e.pdf
|
| 69 |
+
2025-02-28 15:34:27,000 [INFO] __main__ - Gemini returned subtopics: {'AS Unit 1': [11, 17], 'AS Unit 2': [18, 23], 'A2 Unit 3': [24, 30], 'A2 Unit 4': [31, 35]}
|
| 70 |
+
2025-02-28 15:34:27,001 [INFO] __main__ - Loaded 1510568 bytes from local file '/home/user/app/input_output/wjec-gce-maths-spec-from-2017-e.pdf'
|
| 71 |
+
2025-02-28 15:34:27,187 [INFO] __main__ - Occurrences of subtopic 'AS Unit 1': [3, 4, 38, 40]
|
| 72 |
+
2025-02-28 15:34:27,374 [INFO] __main__ - Occurrences of subtopic 'AS Unit 2': [3, 4, 38, 40]
|
| 73 |
+
2025-02-28 15:34:27,564 [INFO] __main__ - Occurrences of subtopic 'A2 Unit 3': [3, 5, 38, 40]
|
| 74 |
+
2025-02-28 15:34:27,747 [INFO] __main__ - Occurrences of subtopic 'A2 Unit 4': [3, 5, 38, 40]
|
| 75 |
+
2025-02-28 15:34:27,748 [INFO] __main__ - Processing pages (0-based): [38, 39, 40, 41, 42, 43, 44]
|
| 76 |
+
2025-02-28 15:34:33,618 [INFO] __main__ - GPU memory cleaned up.
|
| 77 |
+
2025-02-28 15:36:35,445 [INFO] __main__ - Processing PDF: /home/user/app/input_output/wjec-gce-maths-spec-from-2017-e.pdf
|
| 78 |
+
2025-02-28 15:36:36,512 [INFO] __main__ - Gemini returned subtopics: {'2.1 AS Unit 1': [11, 17], '2.2 AS Unit 2': [18, 23], '2.3 A2 Unit 3': [24, 30], '2.4 A2 Unit 4': [31, 35]}
|
| 79 |
+
2025-02-28 15:36:36,513 [INFO] __main__ - Loaded 1510568 bytes from local file '/home/user/app/input_output/wjec-gce-maths-spec-from-2017-e.pdf'
|
| 80 |
+
2025-02-28 15:36:36,820 [INFO] __main__ - Occurrences of subtopic '2.1 AS Unit 1': [3]
|
| 81 |
+
2025-02-28 15:36:36,820 [WARNING] __main__ - No suitable occurrence for '2.1 AS Unit 1'. Using page 3.
|
| 82 |
+
2025-02-28 15:36:37,023 [INFO] __main__ - Occurrences of subtopic '2.2 AS Unit 2': [3]
|
| 83 |
+
2025-02-28 15:36:37,024 [WARNING] __main__ - No suitable occurrence for '2.2 AS Unit 2'. Using page 3.
|
| 84 |
+
2025-02-28 15:36:37,207 [INFO] __main__ - Occurrences of subtopic '2.3 A2 Unit 3': [3]
|
| 85 |
+
2025-02-28 15:36:37,207 [WARNING] __main__ - No suitable occurrence for '2.3 A2 Unit 3'. Using page 3.
|
| 86 |
+
2025-02-28 15:36:37,389 [INFO] __main__ - Occurrences of subtopic '2.4 A2 Unit 4': [3]
|
| 87 |
+
2025-02-28 15:36:37,389 [WARNING] __main__ - No suitable occurrence for '2.4 A2 Unit 4'. Using page 3.
|
| 88 |
+
2025-02-28 15:36:37,390 [INFO] __main__ - Processing pages (0-based): [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
|
| 89 |
+
2025-02-28 15:36:38,518 [INFO] __main__ - GPU memory cleaned up.
|
topic_extraction.py
CHANGED
|
@@ -500,10 +500,10 @@ The correct output should be:
|
|
| 500 |
In that scenario, your output might look like:
|
| 501 |
|
| 502 |
{{
|
| 503 |
-
"AS Unit 1": [11, 17],
|
| 504 |
-
"AS Unit 2": [18, 23],
|
| 505 |
-
"A2 Unit 3": [24, 30],
|
| 506 |
-
"A2 Unit 4": [31, 35]
|
| 507 |
}}
|
| 508 |
|
| 509 |
4. Another example might list subtopics:
|
|
@@ -542,7 +542,10 @@ Now, extract topics from this text:
|
|
| 542 |
{first_pages_text}
|
| 543 |
"""
|
| 544 |
global _GEMINI_CLIENT
|
|
|
|
|
|
|
| 545 |
client = _GEMINI_CLIENT
|
|
|
|
| 546 |
try:
|
| 547 |
response = client.models.generate_content(
|
| 548 |
model="gemini-2.0-flash",
|
|
@@ -725,7 +728,7 @@ class MineruNoTextProcessor:
|
|
| 725 |
self.cleanup_gpu()
|
| 726 |
|
| 727 |
if __name__ == "__main__":
|
| 728 |
-
input_pdf = "/home/user/app/input_output/
|
| 729 |
output_dir = "/home/user/app/outputs"
|
| 730 |
gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
|
| 731 |
try:
|
|
|
|
| 500 |
In that scenario, your output might look like:
|
| 501 |
|
| 502 |
{{
|
| 503 |
+
"2.1 AS Unit 1": [11, 17],
|
| 504 |
+
"2.2 AS Unit 2": [18, 23],
|
| 505 |
+
"2.3 A2 Unit 3": [24, 30],
|
| 506 |
+
"2.4 A2 Unit 4": [31, 35]
|
| 507 |
}}
|
| 508 |
|
| 509 |
4. Another example might list subtopics:
|
|
|
|
| 542 |
{first_pages_text}
|
| 543 |
"""
|
| 544 |
global _GEMINI_CLIENT
|
| 545 |
+
if _GEMINI_CLIENT is None:
|
| 546 |
+
_GEMINI_CLIENT = genai.Client(api_key=self.api_key)
|
| 547 |
client = _GEMINI_CLIENT
|
| 548 |
+
|
| 549 |
try:
|
| 550 |
response = client.models.generate_content(
|
| 551 |
model="gemini-2.0-flash",
|
|
|
|
| 728 |
self.cleanup_gpu()
|
| 729 |
|
| 730 |
if __name__ == "__main__":
|
| 731 |
+
input_pdf = "/home/user/app/input_output/wjec-gce-maths-spec-from-2017-e.pdf"
|
| 732 |
output_dir = "/home/user/app/outputs"
|
| 733 |
gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
|
| 734 |
try:
|