Changed image processing
Browse files
- mineru_single.py +19 -2
- model_classification/svm_model.joblib +2 -2
mineru_single.py
CHANGED
|
@@ -98,6 +98,8 @@ class Processor:
|
|
| 98 |
finally:
|
| 99 |
# GPU memory is cleaned up after each processing.
|
| 100 |
self.cleanup_gpu()
|
|
|
|
|
|
|
| 101 |
class s3Writer:
|
| 102 |
def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
|
| 103 |
self.bucket = bucket
|
|
@@ -123,6 +125,7 @@ class s3Writer:
|
|
| 123 |
logger.error(f"Failed to upload to S3: {str(e)}")
|
| 124 |
raise
|
| 125 |
|
|
|
|
| 126 |
class ImageWriter(DataWriter):
|
| 127 |
"""
|
| 128 |
Receives each extracted image. Classifies it, uploads if relevant, or flags
|
|
@@ -177,7 +180,8 @@ class ImageWriter(DataWriter):
|
|
| 177 |
full_path = info['full_path']
|
| 178 |
md_content = md_content.replace(f"", f"")
|
| 179 |
return md_content
|
| 180 |
-
|
|
|
|
| 181 |
def call_gemini_for_image_description(image_data: bytes) -> str:
|
| 182 |
"""Convert image bytes to Gemini-compatible format and get description"""
|
| 183 |
from google import genai
|
|
@@ -206,6 +210,10 @@ def call_gemini_for_image_description(image_data: bytes) -> str:
|
|
| 206 |
If there are not enough data, consider information from the surrounding context.
|
| 207 |
Additionally, if the image contains a truncated part, you must describe it and mark as a
|
| 208 |
part of some another image that goes before or after current image.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
"""},
|
| 210 |
{
|
| 211 |
"inline_data": {
|
|
@@ -225,8 +233,17 @@ def call_gemini_for_image_description(image_data: bytes) -> str:
|
|
| 225 |
except Exception as e:
|
| 226 |
logger.error(f"Error getting image description: {str(e)}")
|
| 227 |
return ("error", "Error describing image", None)
|
|
|
|
|
|
|
| 228 |
if __name__ == "__main__":
|
| 229 |
processor = Processor()
|
| 230 |
single_url = "https://quextro-resources.s3.eu-west-2.amazonaws.com/1739967958667-643657-mark-scheme-computer-principles.pdf?response-content-disposition=inline&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEJT%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCWV1LXdlc3QtMiJGMEQCIARfSyuot0h2RNrcqVQkc2T%2B1fJZ64NfjmkmAFgCkTG6AiArmbJDAUr7T85HdqAT2RbyLhmiIgpSo3ci4%2FUtSap2wCrUAwi8%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDUwOTM5OTYxODAzMCIMkfFm%2FgBrHsH1qh59KqgDjfZd1%2BKGzxkn7JorfQ07dL%2BL5fjCA6kmNAzCnCjDpTLnNjBfB1vnO2ZLvtC8RNvnaewY6tFWUfl39dC62ldnfajHeFmxkZqBcbDf3oOGnuO2PIvBgb5%2BvppVDkYjWz7vv5TzpgC2sVzjA38QMwxAnausYWDgspap7qjlfoLJUiBOq9SIMZyKVsfeAf4OiUl0TDc2nheqvNXOJy9TPh94KWbBT35vP3fU9A7ZdF4sElm4nVZMnOPdbR7%2Ba6F57nPLZvUaLZC5Nb011ef6%2BhAxr9yeONh5MAoTGUH2qzedDmN%2FbKannddBy%2FNIaP%2BhF7lWUkKemQrM5vajwU6k2Q45pLruKWRkjtrWxdmkQE4zb67ETj5eGL%2BlPPj%2BPtQWzF7UaoWPUH4tGBZ%2Bqdu479rU1ZSg%2B15lR%2F8SAgP%2BydATGwyRtXEvMRJZIiUems8i6ehxWC%2FscY2%2FtCk9OREKhLwOEEdJDAR4vqt68lnnvVomHrVjwNQvyP9A4V8Ct%2B0SjxP%2F86kJnX3o%2FVEoFT44JWICuMuf8kwoelUbZGPl6SaftGsRSUvoy7PV5TCN3du9BjrlAjKhLpjsCwgp1rJ8cPBFcUgOmL3iXrtHs3FhDLljxbXRZ%2FadHkxAlzf%2BXym%2BFBnhdCkDfmWcMEH3GAOFfv%2FlE5SsZMO1JoXbzQlO3OX6nrUacj7LF7ZoO8TYMVoTyEZSLEABNOU7KCILaFeDGRDJ8Ia5I3jnXvOVouFn2VnhykCuWPTunjkMEQBiHa3mbZP0mVcSviujHXatN11INiR%2BPwAN5oxKXeT25B%2FCCI3wib5Av2tzp8zuw8joib5PWNXOYfRgMR7R0Sj%2FjW5SxWr%2BTD9TAD3%2Fqj5pj3Oo13dNGdv5RwGqk1iHd8okpkFYlxEmXD2tTanpxX8ON1%2FLHz%2BNEUJDOogx8TLw5I6mkVs3zjoMhhwn2%2BWrlnNa%2F3i9lAGyLY6Ps4U23Hv7b4gpH4%2BeJN72Z95hrNtcumq4uuf0pRoJPQ9pjiZttjeDwNZzb7d3XuiEQeOgK8rpTeEgduxhdJOOLwZGrg%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ASIAXNGUVKHXFLYKHBHD%2F20250220%2Feu-west-2%2Fs3%2Faws4_request&X-Amz-Date=20250220T111935Z&X-Amz-Expires=10800&X-Amz-SignedHeaders=host&X-Amz-Signature=64aa008fdafe72f1a693078156451c0f6f702e89e546954d6b3d61abf9f73ec8"
|
| 231 |
markdown_result = processor.process(single_url, key="1234323")
|
| 232 |
-
print("Single file Markdown:\n", markdown_result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
finally:
|
| 99 |
# GPU memory is cleaned up after each processing.
|
| 100 |
self.cleanup_gpu()
|
| 101 |
+
|
| 102 |
+
|
| 103 |
class s3Writer:
|
| 104 |
def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
|
| 105 |
self.bucket = bucket
|
|
|
|
| 125 |
logger.error(f"Failed to upload to S3: {str(e)}")
|
| 126 |
raise
|
| 127 |
|
| 128 |
+
|
| 129 |
class ImageWriter(DataWriter):
|
| 130 |
"""
|
| 131 |
Receives each extracted image. Classifies it, uploads if relevant, or flags
|
|
|
|
| 180 |
full_path = info['full_path']
|
| 181 |
md_content = md_content.replace(f"", f"")
|
| 182 |
return md_content
|
| 183 |
+
|
| 184 |
+
|
| 185 |
def call_gemini_for_image_description(image_data: bytes) -> str:
|
| 186 |
"""Convert image bytes to Gemini-compatible format and get description"""
|
| 187 |
from google import genai
|
|
|
|
| 210 |
If there are not enough data, consider information from the surrounding context.
|
| 211 |
Additionally, if the image contains a truncated part, you must describe it and mark as a
|
| 212 |
part of some another image that goes before or after current image.
|
| 213 |
+
|
| 214 |
+
If an image is a multiple-choice question's options, make sure to modify your answer to add
|
| 215 |
+
"MCQ: A option B option C option D option", where MCQ is a descriptor and "option" would be
|
| 216 |
+
replaced with actual option from image.
|
| 217 |
"""},
|
| 218 |
{
|
| 219 |
"inline_data": {
|
|
|
|
| 233 |
except Exception as e:
|
| 234 |
logger.error(f"Error getting image description: {str(e)}")
|
| 235 |
return ("error", "Error describing image", None)
|
| 236 |
+
|
| 237 |
+
|
| 238 |
if __name__ == "__main__":
|
| 239 |
processor = Processor()
|
| 240 |
single_url = "https://quextro-resources.s3.eu-west-2.amazonaws.com/1739967958667-643657-mark-scheme-computer-principles.pdf?response-content-disposition=inline&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEJT%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCWV1LXdlc3QtMiJGMEQCIARfSyuot0h2RNrcqVQkc2T%2B1fJZ64NfjmkmAFgCkTG6AiArmbJDAUr7T85HdqAT2RbyLhmiIgpSo3ci4%2FUtSap2wCrUAwi8%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDUwOTM5OTYxODAzMCIMkfFm%2FgBrHsH1qh59KqgDjfZd1%2BKGzxkn7JorfQ07dL%2BL5fjCA6kmNAzCnCjDpTLnNjBfB1vnO2ZLvtC8RNvnaewY6tFWUfl39dC62ldnfajHeFmxkZqBcbDf3oOGnuO2PIvBgb5%2BvppVDkYjWz7vv5TzpgC2sVzjA38QMwxAnausYWDgspap7qjlfoLJUiBOq9SIMZyKVsfeAf4OiUl0TDc2nheqvNXOJy9TPh94KWbBT35vP3fU9A7ZdF4sElm4nVZMnOPdbR7%2Ba6F57nPLZvUaLZC5Nb011ef6%2BhAxr9yeONh5MAoTGUH2qzedDmN%2FbKannddBy%2FNIaP%2BhF7lWUkKemQrM5vajwU6k2Q45pLruKWRkjtrWxdmkQE4zb67ETj5eGL%2BlPPj%2BPtQWzF7UaoWPUH4tGBZ%2Bqdu479rU1ZSg%2B15lR%2F8SAgP%2BydATGwyRtXEvMRJZIiUems8i6ehxWC%2FscY2%2FtCk9OREKhLwOEEdJDAR4vqt68lnnvVomHrVjwNQvyP9A4V8Ct%2B0SjxP%2F86kJnX3o%2FVEoFT44JWICuMuf8kwoelUbZGPl6SaftGsRSUvoy7PV5TCN3du9BjrlAjKhLpjsCwgp1rJ8cPBFcUgOmL3iXrtHs3FhDLljxbXRZ%2FadHkxAlzf%2BXym%2BFBnhdCkDfmWcMEH3GAOFfv%2FlE5SsZMO1JoXbzQlO3OX6nrUacj7LF7ZoO8TYMVoTyEZSLEABNOU7KCILaFeDGRDJ8Ia5I3jnXvOVouFn2VnhykCuWPTunjkMEQBiHa3mbZP0mVcSviujHXatN11INiR%2BPwAN5oxKXeT25B%2FCCI3wib5Av2tzp8zuw8joib5PWNXOYfRgMR7R0Sj%2FjW5SxWr%2BTD9TAD3%2Fqj5pj3Oo13dNGdv5RwGqk1iHd8okpkFYlxEmXD2tTanpxX8ON1%2FLHz%2BNEUJDOogx8TLw5I6mkVs3zjoMhhwn2%2BWrlnNa%2F3i9lAGyLY6Ps4U23Hv7b4gpH4%2BeJN72Z95hrNtcumq4uuf0pRoJPQ9pjiZttjeDwNZzb7d3XuiEQeOgK8rpTeEgduxhdJOOLwZGrg%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ASIAXNGUVKHXFLYKHBHD%2F20250220%2Feu-west-2%2Fs3%2Faws4_request&X-Amz-Date=20250220T111935Z&X-Amz-Expires=10800&X-Amz-SignedHeaders=host&X-Amz-Signature=64aa008fdafe72f1a693078156451c0f6f702e89e546954d6b3d61abf9f73ec8"
|
| 241 |
markdown_result = processor.process(single_url, key="1234323")
|
| 242 |
+
print("Single file Markdown:\n", markdown_result)
|
| 243 |
+
|
| 244 |
+
# if __name__ == "__main__":
|
| 245 |
+
# with open("./test_image.jpg", "rb") as file:
|
| 246 |
+
# test_image = file.read()
|
| 247 |
+
|
| 248 |
+
# print(call_gemini_for_image_description(test_image))
|
| 249 |
+
|
model_classification/svm_model.joblib
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:70dd9281be75d0e9d3889bb48ad65088344b83d9d7c33c682a012e5468440e1f
|
| 3 |
+
size 263076507
|