WHO-rag-system / s3_utils.py
import logging
import os

# Configuration for the S3-backed ChromaDB store, read from the environment.
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_REGION = os.getenv("AWS_REGION")

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def get_s3_client():
    """Build a boto3 S3 client from environment credentials.

    Falls back to boto3's default credential chain (instance role, shared
    config, etc.) when explicit keys are not present.
    """
    import boto3  # lazy import keeps module import cheap when S3 access is not needed

    if not AWS_ACCESS_KEY or not AWS_SECRET_KEY:
        logging.warning("AWS credentials not found in environment. Using default config.")
        return boto3.client('s3', region_name=AWS_REGION)
    return boto3.client(
        's3',
        aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET_KEY,
        region_name=AWS_REGION
    )
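
# --- Sanity-check sketch (not part of the original module; helper name is an assumption) ---
# A lightweight way to confirm the configured credentials can actually reach
# S3_BUCKET_NAME before attempting a full sync: head_bucket() raises a ClientError
# when the bucket is missing or access is denied, so the helper maps that to a bool.
def check_bucket_access() -> bool:
    """Return True if the configured bucket is reachable with the current credentials."""
    try:
        get_s3_client().head_bucket(Bucket=S3_BUCKET_NAME)
        return True
    except Exception as e:
        logging.error(f"S3 bucket access check failed: {e}")
        return False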
def download_chroma_folder_from_s3(s3_prefix: str, local_dir: str):
    """Mirror every object under ``s3_prefix`` into ``local_dir``.

    Used to restore a persisted ChromaDB index from S3 at startup.
    """
    s3 = get_s3_client()
    paginator = s3.get_paginator("list_objects_v2")
    try:
        for page in paginator.paginate(Bucket=S3_BUCKET_NAME, Prefix=s3_prefix):
            for obj in page.get("Contents", []):
                s3_key = obj["Key"]
                if s3_key.endswith("/"):
                    continue  # skip zero-byte "directory" marker objects
                rel_path = os.path.relpath(s3_key, s3_prefix)
                local_path = os.path.join(local_dir, rel_path)
                os.makedirs(os.path.dirname(local_path), exist_ok=True)
                with open(local_path, "wb") as f:
                    s3.download_fileobj(Bucket=S3_BUCKET_NAME, Key=s3_key, Fileobj=f)
        logging.info(f"ChromaDB folder downloaded from S3 to {local_dir} successfully.")
    except Exception as e:
        logging.error(f"Failed to download ChromaDB folder from S3: {e}")