# FreeBibTec2 / app.py
import os
import re
import json
import logging
import zipfile
import asyncio
import tempfile
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, field
from pathlib import Path
from datetime import datetime
import gradio as gr
from enum import Enum
import hashlib
import urllib.parse
import aiohttp
# Import smolagents
from smolagents import CodeAgent, ToolCallingAgent, LiteLLMModel
from smolagents.tools import Tool, tool
from pydantic import BaseModel, Field
# Logging configuration
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler('bibliography_nebius.log'),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
# ========== NEBIUS API CONFIGURATION ==========
class NebiusAPI:
"""Client for the Nebius AI API"""
def __init__(self, api_key: str, base_url: str = "https://api.studio.nebius.com"):
self.api_key = api_key
self.base_url = base_url
self.headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json"
}
async def generate_text(self, prompt: str, model: str = "neural-chat-7b-v3-1",
max_tokens: int = 1000, temperature: float = 0.7) -> str:
"""Generar texto usando modelos de Nebius"""
url = f"{self.base_url}/v1/chat/completions"
payload = {
"model": model,
"messages": [
{"role": "user", "content": prompt}
],
"max_tokens": max_tokens,
"temperature": temperature,
"top_p": 0.95
}
try:
async with aiohttp.ClientSession() as session:
async with session.post(
url,
headers=self.headers,
json=payload,
timeout=30
) as response:
if response.status == 200:
data = await response.json()
return data.get("choices", [{}])[0].get("message", {}).get("content", "")
else:
error_text = await response.text()
logger.error(f"Nebius API error {response.status}: {error_text}")
return ""
except Exception as e:
logger.error(f"Error calling Nebius API: {e}")
return ""
async def extract_references(self, text: str) -> List[Dict[str, Any]]:
"""Usar Nebius para extraer referencias de texto"""
prompt = f"""Analiza el siguiente texto y extrae todas las referencias bibliográficas.
Identifica DOIs, ISBNs, URLs académicas, arXiv IDs y otras referencias académicas.
Texto:
{text[:5000]}
Devuelve un JSON con el siguiente formato:
{{
"references": [
{{
"type": "doi|isbn|arxiv|url|pmid|other",
"identifier": "identificador_completo",
"raw_text": "texto_original_encontrado",
"confidence": 0.0-1.0,
"context": "texto_alrededor_del_identificador"
}}
]
}}
Solo devuelve el JSON, sin texto adicional."""
response = await self.generate_text(prompt, max_tokens=2000)
try:
# Look for JSON in the response
json_match = re.search(r'\{.*\}', response, re.DOTALL)
if json_match:
data = json.loads(json_match.group())
return data.get("references", [])
except Exception as e:
logger.error(f"Error parsing Nebius response: {e}")
return []
async def verify_reference(self, reference: Dict[str, Any]) -> Dict[str, Any]:
"""Verificar una referencia usando Nebius"""
prompt = f"""Verifica la siguiente referencia académica y proporciona información sobre su accesibilidad:
Tipo: {reference.get('type')}
Identificador: {reference.get('identifier')}
Contexto: {reference.get('context', 'No disponible')}
Analiza:
1. ¿Es un identificador válido?
2. ¿Dónde podría encontrarse este recurso?
3. ¿Es probable que esté disponible en acceso abierto?
4. Proporciona posibles URLs para acceder al recurso.
Devuelve un JSON con el siguiente formato:
{{
"valid": true/false,
"confidence": 0.0-1.0,
"sources": ["lista", "de", "posibles", "fuentes"],
"likely_open_access": true/false,
"suggested_urls": ["url1", "url2"],
"notes": "notas_adicionales"
}}"""
response = await self.generate_text(prompt, max_tokens=1000)
try:
json_match = re.search(r'\{.*\}', response, re.DOTALL)
if json_match:
return json.loads(json_match.group())
except Exception as e:
logger.error(f"Error parsing verification response: {e}")
return {"valid": False, "confidence": 0.0, "sources": [], "notes": "Error en verificación"}
# ========== DATA MODELS ==========
class ResourceType(str, Enum):
DOI = "doi"
ISBN = "isbn"
ARXIV = "arxiv"
URL = "url"
PMID = "pmid"
BIBTEX = "bibtex"
CITATION = "citation"
UNKNOWN = "unknown"
class CitationModel(BaseModel):
id: str
raw_text: str
resource_type: ResourceType
identifier: str
metadata: Dict[str, Any] = Field(default_factory=dict)
confidence: float = 0.0
extracted_from: str
position: Tuple[int, int] = (0, 0)
nebius_verified: bool = False
nebius_confidence: float = 0.0
class VerificationResult(BaseModel):
citation: CitationModel
verified: bool
verification_source: str
download_url: Optional[str]
file_format: Optional[str]
file_size: Optional[int]
quality_score: float
notes: List[str] = Field(default_factory=list)
nebius_analysis: Optional[Dict[str, Any]] = None
class ProcessingReport(BaseModel):
input_file: str
total_citations: int
verified_resources: List[VerificationResult]
downloaded_files: List[str]
failed_verifications: List[CitationModel]
processing_time: float
summary: Dict[str, Any] = Field(default_factory=dict)
timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())
nebius_usage: Dict[str, Any] = Field(default_factory=dict)
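# Example of how a citation is represented (illustrative values only):
#
#     citation = CitationModel(
#         id="a1b2c3d4e5f6",
#         raw_text="doi:10.1000/xyz123",
#         resource_type=ResourceType.DOI,
#         identifier="10.1000/xyz123",
#         confidence=0.8,
#         extracted_from="paper.txt",
#     )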
# ========== TOOLS WITH NEBIUS INTEGRATION ==========
class NebiusEnhancedExtractionTool(Tool):
name = "nebius_extract_references"
description = """
Extract bibliographic references using Nebius AI for enhanced accuracy.
Args:
text (str): Text to analyze
nebius_api_key (str): Nebius API key
use_ai_enhancement (bool): Whether to use Nebius AI for enhancement
Returns:
List[Dict]: Extracted references with Nebius AI analysis
"""
def __init__(self):
super().__init__()
# Basic patterns for initial extraction
self.patterns = {
ResourceType.DOI: [
r'\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b',
r'doi:\s*(10\.\d{4,9}/[-._;()/:A-Z0-9]+)',
],
ResourceType.ISBN: [
r'ISBN(?:-1[03])?:?\s*(?=[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?[0-9]+[- ]?[0-9]+[- ]?[0-9X]',
],
ResourceType.ARXIV: [
r'arXiv:\s*(\d{4}\.\d{4,5}(v\d+)?)',
r'arxiv:\s*([a-z\-]+/\d{7})'
],
}
def forward(self, text: str, nebius_api_key: Optional[str] = None,
use_ai_enhancement: bool = False) -> List[Dict[str, Any]]:
"""Extract references, optionally enhanced with Nebius AI"""
# Basic extraction
basic_references = self._extract_basic(text)
if not use_ai_enhancement or not nebius_api_key:
return basic_references
# Enhancement with Nebius AI
try:
nebius = NebiusAPI(nebius_api_key)
# Run the async call from a synchronous context
import nest_asyncio
nest_asyncio.apply()
# Extract with Nebius
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
nebius_references = loop.run_until_complete(
nebius.extract_references(text[:10000])  # Limit size for the API
)
loop.close()
# Merge results
enhanced_references = self._merge_references(basic_references, nebius_references)
return enhanced_references
except Exception as e:
logger.error(f"Error using Nebius enhancement: {e}")
return basic_references
def _extract_basic(self, text: str) -> List[Dict[str, Any]]:
"""Extracción básica de referencias"""
references = []
for resource_type, patterns in self.patterns.items():
for pattern in patterns:
matches = re.finditer(pattern, text, re.IGNORECASE)
for match in matches:
identifier = match.group(1) if match.groups() else match.group(0)
identifier = self._clean_identifier(identifier, resource_type)
if identifier:
reference = {
"id": hashlib.md5(identifier.encode()).hexdigest()[:12],
"raw_text": match.group(0),
"type": resource_type.value,
"identifier": identifier,
"confidence": 0.8,
"context": self._get_context(text, match.start(), match.end()),
"position": (match.start(), match.end()),
"extraction_method": "regex"
}
references.append(reference)
return references
def _merge_references(self, basic: List[Dict], nebius: List[Dict]) -> List[Dict]:
"""Combinar referencias de extracción básica y Nebius"""
merged = basic.copy()
for nebius_ref in nebius:
# Verificar si ya existe
exists = False
for ref in merged:
if ref.get('identifier') == nebius_ref.get('identifier'):
exists = True
# Actualizar confianza y metadata
ref['confidence'] = max(ref.get('confidence', 0),
nebius_ref.get('confidence', 0))
ref['extraction_method'] = 'regex+nebius'
break
if not exists:
# Convertir formato Nebius a nuestro formato
new_ref = {
"id": hashlib.md5(
nebius_ref.get('identifier', '').encode()
).hexdigest()[:12],
"raw_text": nebius_ref.get('raw_text', ''),
"type": nebius_ref.get('type', 'unknown'),
"identifier": nebius_ref.get('identifier', ''),
"confidence": nebius_ref.get('confidence', 0.7),
"context": nebius_ref.get('context', ''),
"position": (0, 0),
"extraction_method": 'nebius'
}
merged.append(new_ref)
return merged
def _clean_identifier(self, identifier: str, resource_type: ResourceType) -> str:
"""Limpiar identificador"""
identifier = identifier.strip()
prefixes = ['doi:', 'DOI:', 'arxiv:', 'arXiv:', 'isbn:', 'ISBN:', 'pmid:', 'PMID:']
for prefix in prefixes:
if identifier.startswith(prefix):
identifier = identifier[len(prefix):].strip()
identifier = identifier.strip('"\'<>()[]{}')
if resource_type == ResourceType.URL:
if not identifier.startswith(('http://', 'https://')):
identifier = f'https://{identifier}'
return identifier
def _get_context(self, text: str, start: int, end: int, window: int = 100) -> str:
"""Obtener contexto alrededor del match"""
context_start = max(0, start - window)
context_end = min(len(text), end + window)
return text[context_start:context_end]
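# Minimal usage sketch for the extraction tool (regex-only path; the Nebius
# enhancement is skipped when no API key is provided; DOI is a placeholder):
#
#     extractor = NebiusEnhancedExtractionTool()
#     refs = extractor.forward("As shown in doi:10.1000/xyz123 ...", use_ai_enhancement=False)
#     # -> [{"type": "doi", "identifier": "10.1000/xyz123", "confidence": 0.8, ...}]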
class NebiusVerificationTool(Tool):
name = "nebius_verify_reference"
description = """
Verify academic references using Nebius AI analysis.
Args:
reference (Dict): Reference to verify
nebius_api_key (str): Nebius API key
deep_verify (bool): Whether to perform deep verification
Returns:
Dict: Verification results with Nebius analysis
"""
def __init__(self):
super().__init__()
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
def forward(self, reference: Dict[str, Any], nebius_api_key: Optional[str] = None,
deep_verify: bool = False) -> Dict[str, Any]:
"""Verify a reference with Nebius"""
result = {
"reference": reference,
"verified": False,
"verification_source": "direct",
"download_url": None,
"file_format": None,
"file_size": None,
"quality_score": 0.0,
"notes": [],
"nebius_analysis": None
}
# Direct verification first
direct_result = self._direct_verification(reference)
if direct_result.get("verified"):
result.update(direct_result)
result["quality_score"] = 0.9
# Nebius verification when available
if nebius_api_key and deep_verify:
nebius_result = self._nebius_verification(reference, nebius_api_key)
result["nebius_analysis"] = nebius_result
if nebius_result.get("valid", False):
result["verified"] = True
result["verification_source"] = "nebius"
result["quality_score"] = max(
result.get("quality_score", 0),
nebius_result.get("confidence", 0)
)
# Add URLs suggested by Nebius
suggested_urls = nebius_result.get("suggested_urls", [])
if suggested_urls and not result.get("download_url"):
result["download_url"] = suggested_urls[0]
result["notes"].append(
f"Nebius analysis: {nebius_result.get('notes', 'No notes')}"
)
return result
def _direct_verification(self, reference: Dict[str, Any]) -> Dict[str, Any]:
"""Verificación directa de la referencia"""
import requests
ref_type = reference.get("type", "")
identifier = reference.get("identifier", "")
try:
if ref_type == "doi":
return self._verify_doi(identifier)
elif ref_type == "arxiv":
return self._verify_arxiv(identifier)
elif ref_type == "url":
return self._verify_url(identifier)
elif ref_type == "isbn":
return self._verify_isbn(identifier)
except Exception as e:
logger.error(f"Direct verification error: {e}")
return {"verified": False, "notes": [f"Direct verification failed for {ref_type}"]}
def _verify_doi(self, doi: str) -> Dict[str, Any]:
"""Verificar DOI"""
import requests
try:
# Crossref
url = f"https://api.crossref.org/works/{doi}"
response = requests.get(url, headers=self.headers, timeout=10)
if response.status_code == 200:
data = response.json()
work = data.get('message', {})
result = {"verified": True, "notes": ["Verified via Crossref"]}
# Buscar PDF
links = work.get('link', [])
for link in links:
if link.get('content-type') == 'application/pdf':
result["download_url"] = link.get('URL')
result["file_format"] = "pdf"
break
return result
except Exception as e:
logger.error(f"DOI verification error: {e}")
return {"verified": False}
def _verify_arxiv(self, arxiv_id: str) -> Dict[str, Any]:
"""Verificar arXiv ID"""
import requests
try:
# Clean up the ID
if 'arxiv:' in arxiv_id.lower():
arxiv_id = arxiv_id.split(':')[-1].strip()
# Check that the paper exists
api_url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
response = requests.get(api_url, headers=self.headers, timeout=10)
if response.status_code == 200:
return {
"verified": True,
"download_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf",
"file_format": "pdf",
"notes": ["arXiv paper available"]
}
except Exception as e:
logger.error(f"arXiv verification error: {e}")
return {"verified": False}
def _verify_url(self, url: str) -> Dict[str, Any]:
"""Verificar URL"""
import requests
try:
response = requests.head(url, headers=self.headers, timeout=10, allow_redirects=True)
if response.status_code == 200:
result = {"verified": True, "notes": [f"URL accessible: {response.status_code}"]}
# Check whether it is a PDF
content_type = response.headers.get('content-type', '')
if 'application/pdf' in content_type:
result["download_url"] = url
result["file_format"] = "pdf"
return result
except Exception as e:
logger.error(f"URL verification error: {e}")
return {"verified": False}
def _verify_isbn(self, isbn: str) -> Dict[str, Any]:
"""Verificar ISBN"""
import requests
try:
# Open Library
url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&format=json"
response = requests.get(url, headers=self.headers, timeout=10)
if response.status_code == 200:
data = response.json()
if data:
return {
"verified": True,
"notes": ["ISBN found in Open Library"]
}
except Exception as e:
logger.error(f"ISBN verification error: {e}")
return {"verified": False}
def _nebius_verification(self, reference: Dict[str, Any], api_key: str) -> Dict[str, Any]:
"""Verificación con Nebius AI"""
try:
nebius = NebiusAPI(api_key)
# Run the async call from a synchronous context
import nest_asyncio
nest_asyncio.apply()
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
analysis = loop.run_until_complete(
nebius.verify_reference(reference)
)
loop.close()
return analysis
except Exception as e:
logger.error(f"Nebius verification error: {e}")
return {"valid": False, "confidence": 0.0, "notes": f"Error: {str(e)}"}
# ========== MAIN SYSTEM WITH NEBIUS ==========
class NebiusBibliographySystem:
"""Bibliography processing system built on Nebius AI"""
def __init__(self, config: Dict[str, Any]):
self.config = config
self.nebius_api_key = config.get("nebius_api_key")
self.use_nebius = bool(self.nebius_api_key)
# Inicializar herramientas
self.extraction_tool = NebiusEnhancedExtractionTool()
self.verification_tool = NebiusVerificationTool()
# Configurar modelo LiteLLM para agentes
self.llm_model = self._configure_llm()
# Directorios de salida
self.output_base = "nebius_bibliography"
self.download_dir = os.path.join(self.output_base, "downloads")
self.report_dir = os.path.join(self.output_base, "reports")
self.log_dir = os.path.join(self.output_base, "logs")
# Crear directorios
for dir_path in [self.output_base, self.download_dir, self.report_dir, self.log_dir]:
os.makedirs(dir_path, exist_ok=True)
# Estadísticas
self.stats = {
"total_processed": 0,
"total_references": 0,
"nebius_calls": 0,
"success_rate": 0.0
}
logger.info(f"Nebius system initialized. Nebius AI: {'Enabled' if self.use_nebius else 'Disabled'}")
def _configure_llm(self):
"""Configurar modelo LiteLLM"""
provider = self.config.get("llm_provider", "openai")
if provider == "nebius" and self.nebius_api_key:
# Configurar Nebius como proveedor personalizado
return LiteLLMModel(
model_id=self.config.get("llm_model", "neural-chat-7b-v3-1"),
api_key=self.nebius_api_key,
api_base=self.config.get("nebius_api_base", "https://api.studio.nebius.com/v1")
)
elif provider == "openai":
return LiteLLMModel(
model_id=self.config.get("llm_model", "gpt-4"),
api_key=self.config.get("openai_api_key")
)
else:
# Default to Nebius if available
if self.nebius_api_key:
return LiteLLMModel(
model_id="neural-chat-7b-v3-1",
api_key=self.nebius_api_key,
api_base="https://api.studio.nebius.com/v1"
)
else:
return LiteLLMModel(model_id="gpt-4")
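# Example configuration dict accepted by __init__ (the key shown is a placeholder):
#
#     config = {
#         "llm_provider": "nebius",
#         "llm_model": "neural-chat-7b-v3-1",
#         "nebius_api_key": "<your-nebius-key>",
#         "nebius_api_base": "https://api.studio.nebius.com/v1",
#     }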
async def process_document(self, file_path: str, process_id: Optional[str] = None) -> Dict[str, Any]:
"""Process a complete document with Nebius"""
import time
start_time = time.time()
# Generar ID de proceso
process_id = process_id or self._generate_process_id(file_path)
logger.info(f"[{process_id}] Processing document: {file_path}")
try:
# 1. Leer archivo
file_content = self._read_file(file_path)
if not file_content:
return self._error_result(process_id, "Empty or unreadable file")
# 2. Extraer referencias
logger.info(f"[{process_id}] Extracting references...")
references = self.extraction_tool.forward(
text=file_content,
nebius_api_key=self.nebius_api_key,
use_ai_enhancement=self.use_nebius
)
if self.use_nebius:
self.stats["nebius_calls"] += 1
self.stats["total_references"] += len(references)
logger.info(f"[{process_id}] Found {len(references)} references")
# 3. Verificar referencias
logger.info(f"[{process_id}] Verifying references...")
verification_results = []
failed_verifications = []
for i, ref in enumerate(references):
if i % 5 == 0: # Log cada 5 referencias
logger.info(f"[{process_id}] Verified {i}/{len(references)}")
# Verificar referencia
verification = self.verification_tool.forward(
reference=ref,
nebius_api_key=self.nebius_api_key,
deep_verify=self.use_nebius
)
if verification.get("verified"):
# Convertir a modelo
citation = CitationModel(
id=ref.get("id"),
raw_text=ref.get("raw_text", ""),
resource_type=ResourceType(ref.get("type", "unknown")),
identifier=ref.get("identifier", ""),
confidence=ref.get("confidence", 0.0),
extracted_from=file_path,
position=ref.get("position", (0, 0)),
nebius_verified=self.use_nebius,
nebius_confidence=verification.get("quality_score", 0.0)
)
vr = VerificationResult(
citation=citation,
verified=True,
verification_source=verification.get("verification_source", "unknown"),
download_url=verification.get("download_url"),
file_format=verification.get("file_format"),
file_size=verification.get("file_size"),
quality_score=verification.get("quality_score", 0.0),
notes=verification.get("notes", []),
nebius_analysis=verification.get("nebius_analysis")
)
verification_results.append(vr)
else:
# Referencia fallida
citation = CitationModel(
id=ref.get("id"),
raw_text=ref.get("raw_text", ""),
resource_type=ResourceType(ref.get("type", "unknown")),
identifier=ref.get("identifier", ""),
confidence=ref.get("confidence", 0.0),
extracted_from=file_path,
position=ref.get("position", (0, 0)),
nebius_verified=False,
nebius_confidence=0.0
)
failed_verifications.append(citation)
# 4. Descargar archivos verificados
logger.info(f"[{process_id}] Downloading files...")
downloaded_files = await self._download_files(
verification_results,
process_id
)
# 5. Generar reporte
processing_time = time.time() - start_time
report = ProcessingReport(
input_file=file_path,
total_citations=len(references),
verified_resources=verification_results,
downloaded_files=downloaded_files,
failed_verifications=failed_verifications,
processing_time=processing_time,
summary={
"success_rate": len(verification_results) / max(1, len(references)),
"download_rate": len(downloaded_files) / max(1, len(verification_results)),
"avg_quality": sum(vr.quality_score for vr in verification_results) / max(1, len(verification_results))
},
nebius_usage={
"enabled": self.use_nebius,
"calls": self.stats["nebius_calls"],
"enhanced_references": sum(1 for vr in verification_results if vr.nebius_analysis)
}
)
# 6. Guardar resultados
self._save_results(report, process_id)
self.stats["total_processed"] += 1
self.stats["success_rate"] = report.summary.get("success_rate", 0.0)
logger.info(f"[{process_id}] Processing completed in {processing_time:.2f}s")
return {
"success": True,
"process_id": process_id,
"report": report.dict(),
"zip_path": self._create_zip(report, process_id),
"summary": {
"references_found": len(references),
"verified": len(verification_results),
"downloaded": len(downloaded_files),
"success_rate": f"{report.summary.get('success_rate', 0) * 100:.1f}%",
"processing_time": f"{processing_time:.2f}s"
}
}
except Exception as e:
logger.error(f"[{process_id}] Processing error: {e}")
return self._error_result(process_id, str(e))
def _read_file(self, file_path: str) -> str:
"""Leer contenido del archivo"""
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
return f.read()
except Exception as e:
logger.error(f"Error reading file {file_path}: {e}")
return ""
async def _download_files(self, verification_results: List[VerificationResult],
process_id: str) -> List[str]:
"""Descargar archivos de URLs verificadas"""
downloaded_files = []
for i, vr in enumerate(verification_results):
if vr.download_url:
try:
file_path = await self._download_file(
vr.download_url,
vr.citation.identifier,
process_id,
i
)
if file_path:
downloaded_files.append(file_path)
except Exception as e:
logger.error(f"Download failed for {vr.citation.identifier}: {e}")
return downloaded_files
async def _download_file(self, url: str, identifier: str,
process_id: str, index: int) -> Optional[str]:
"""Descargar un archivo individual"""
import aiohttp
try:
# Crear nombre de archivo seguro
safe_name = re.sub(r'[^\w\-\.]', '_', identifier)
if len(safe_name) > 100:
safe_name = safe_name[:100]
# Determinar extensión
extension = self._get_extension_from_url(url)
if not extension:
extension = ".pdf" # Default
filename = f"{process_id}_{index:03d}_{safe_name}{extension}"
filepath = os.path.join(self.download_dir, filename)
# Descargar
timeout = aiohttp.ClientTimeout(total=60)
async with aiohttp.ClientSession(timeout=timeout) as session:
async with session.get(url, headers={'User-Agent': 'Mozilla/5.0'}) as response:
if response.status == 200:
content = await response.read()
# Verificar que sea un archivo válido
if len(content) > 100: # Archivo no vacío
with open(filepath, 'wb') as f:
f.write(content)
logger.info(f"Downloaded: {filename} ({len(content)} bytes)")
return filepath
return None
except Exception as e:
logger.error(f"Download error for {url}: {e}")
return None
def _get_extension_from_url(self, url: str) -> str:
"""Obtener extensión de archivo desde URL"""
url_lower = url.lower()
if '.pdf' in url_lower:
return '.pdf'
elif '.docx' in url_lower or '.doc' in url_lower:
return '.docx'
elif '.html' in url_lower or '.htm' in url_lower:
return '.html'
elif '.txt' in url_lower:
return '.txt'
elif '.epub' in url_lower:
return '.epub'
return ""
def _generate_process_id(self, file_path: str) -> str:
"""Generar ID único de proceso"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
file_hash = hashlib.md5(file_path.encode()).hexdigest()[:6]
return f"NB_{timestamp}_{file_hash}"
def _save_results(self, report: ProcessingReport, process_id: str):
"""Guardar resultados en disco"""
# Guardar reporte JSON
report_path = os.path.join(self.report_dir, f"{process_id}_report.json")
with open(report_path, 'w', encoding='utf-8') as f:
json.dump(report.dict(), f, indent=2, default=str)
# Guardar resumen en texto
summary_path = os.path.join(self.report_dir, f"{process_id}_summary.txt")
with open(summary_path, 'w', encoding='utf-8') as f:
f.write(self._generate_text_summary(report))
def _create_zip(self, report: ProcessingReport, process_id: str) -> str:
"""Crear archivo ZIP con resultados"""
import zipfile
zip_path = os.path.join(self.output_base, f"{process_id}_results.zip")
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
# Agregar reportes
report_files = [
f for f in os.listdir(self.report_dir)
if f.startswith(process_id)
]
for file in report_files:
filepath = os.path.join(self.report_dir, file)
zipf.write(filepath, f"reports/{file}")
# Agregar archivos descargados
for file_path in report.downloaded_files:
if os.path.exists(file_path):
filename = os.path.basename(file_path)
zipf.write(file_path, f"downloads/{filename}")
# Agregar log
log_path = os.path.join(self.log_dir, f"{process_id}_log.txt")
with open(log_path, 'w') as f:
f.write(f"Process ID: {process_id}\n")
f.write(f"Time: {datetime.now().isoformat()}\n")
f.write(f"Success rate: {report.summary.get('success_rate', 0) * 100:.1f}%\n")
zipf.write(log_path, "process_log.txt")
return zip_path
def _generate_text_summary(self, report: ProcessingReport) -> str:
"""Generar resumen en texto"""
summary = f"""
NEBIUS BIBLIOGRAPHY PROCESSING REPORT
=====================================
Process ID: Generated automatically
Input File: {report.input_file}
Processing Time: {report.processing_time:.2f} seconds
Timestamp: {report.timestamp}
SUMMARY STATISTICS
------------------
Total References Found: {report.total_citations}
Successfully Verified: {len(report.verified_resources)}
Files Downloaded: {len(report.downloaded_files)}
Verification Success Rate: {report.summary.get('success_rate', 0) * 100:.1f}%
Average Quality Score: {report.summary.get('avg_quality', 0):.2f}
NEBIUS AI USAGE
---------------
Enabled: {report.nebius_usage.get('enabled', False)}
API Calls: {report.nebius_usage.get('calls', 0)}
Enhanced References: {report.nebius_usage.get('enhanced_references', 0)}
VERIFIED RESOURCES (Top 10)
---------------------------
"""
for i, vr in enumerate(report.verified_resources[:10], 1):
summary += f"\n{i}. {vr.citation.identifier}"
summary += f"\n Type: {vr.citation.resource_type.value}"
summary += f"\n Source: {vr.verification_source}"
summary += f"\n Quality: {vr.quality_score:.2f}"
summary += f"\n Nebius Enhanced: {vr.citation.nebius_verified}"
if vr.download_url:
summary += f"\n Downloaded: Yes"
summary += "\n"
if report.failed_verifications:
summary += f"\nFAILED VERIFICATIONS ({len(report.failed_verifications)})\n"
summary += "-" * 40 + "\n"
for citation in report.failed_verifications[:5]:
summary += f"- {citation.identifier} ({citation.resource_type.value})\n"
summary += f"\nFILES DOWNLOADED\n"
summary += "-" * 40 + "\n"
for file_path in report.downloaded_files:
if os.path.exists(file_path):
file_size = os.path.getsize(file_path)
summary += f"- {os.path.basename(file_path)} ({file_size} bytes)\n"
return summary
def _error_result(self, process_id: str, error: str) -> Dict[str, Any]:
"""Generar resultado de error"""
return {
"success": False,
"process_id": process_id,
"error": error,
"timestamp": datetime.now().isoformat()
}
def get_stats(self) -> Dict[str, Any]:
"""Obtener estadísticas del sistema"""
return {
"total_processed": self.stats["total_processed"],
"total_references": self.stats["total_references"],
"nebius_calls": self.stats["nebius_calls"],
"success_rate": self.stats["success_rate"],
"output_directory": self.output_base
}
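# End-to-end usage sketch outside the Gradio UI (assumes a local "refs.txt";
# without a Nebius key only the regex/direct-verification path runs):
#
#     system = NebiusBibliographySystem({"nebius_api_key": None})
#     result = asyncio.run(system.process_document("refs.txt"))
#     print(result["summary"])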
# ========== ENHANCED GRADIO INTERFACE ==========
def create_nebius_interface():
"""Create the Gradio interface with Nebius support"""
system = None
current_process = None
def initialize_system(provider, model, nebius_key, nebius_base, openai_key):
"""Inicializar sistema con configuración"""
nonlocal system
config = {
"llm_provider": provider,
"llm_model": model,
"nebius_api_key": nebius_key,
"nebius_api_base": nebius_base or "https://api.studio.nebius.com/v1",
"openai_api_key": openai_key,
"use_nebius": bool(nebius_key)
}
try:
system = NebiusBibliographySystem(config)
return "✅ Sistema inicializado con Nebius AI" if nebius_key else "✅ Sistema inicializado (sin Nebius)"
except Exception as e:
return f"❌ Error: {str(e)}"
async def process_document(file_obj, use_nebius, progress=gr.Progress()):
"""Procesar documento"""
nonlocal system, current_process
if not system:
return None, "❌ Sistema no inicializado", "", "", ""
try:
progress(0, desc="Preparando archivo...")
# Guardar archivo temporalmente
import tempfile
import shutil
temp_dir = tempfile.mkdtemp()
file_path = os.path.join(temp_dir, os.path.basename(file_obj.name))
shutil.copy(file_obj.name, file_path)
progress(0.1, desc="Procesando con Nebius..." if use_nebius else "Procesando...")
# Procesar documento
result = await system.process_document(file_path)
if not result.get("success"):
# Clean up the temporary directory
shutil.rmtree(temp_dir, ignore_errors=True)
return None, f"❌ Error: {result.get('error')}", "", "", "", ""
current_process = result.get("process_id")
summary = result.get("summary", {})
progress(0.9, desc="Generando reportes...")
# Generate the report views
report_data = result.get("report", {})
# HTML output
html_output = _generate_html_report(report_data)
# Text output
text_output = _generate_text_report(report_data)
# JSON output
json_output = json.dumps(report_data, indent=2, default=str)
# Statistics
stats_output = _generate_stats_display(summary)
progress(1.0, desc="Completado!")
# Clean up the temporary directory
shutil.rmtree(temp_dir, ignore_errors=True)
return (
result.get("zip_path"),
f"✅ Proceso {current_process} completado",
html_output,
text_output,
json_output,
stats_output
)
except Exception as e:
logger.error(f"Processing error: {e}")
return None, f"❌ Error: {str(e)}", "", "", "", ""
def _generate_html_report(report_data: Dict) -> str:
"""Generate the HTML report"""
verified = len(report_data.get("verified_resources", []))
total = report_data.get("total_citations", 0)
success_rate = (verified / max(1, total)) * 100
nebius_usage = report_data.get("nebius_usage", {})
html = f"""
<div style="font-family: Arial, sans-serif; padding: 20px;">
<h2 style="color: #2c3e50;">📊 Reporte de Procesamiento Nebius</h2>
<div style="background: #ecf0f1; padding: 15px; border-radius: 10px; margin: 15px 0;">
<h3 style="color: #34495e;">📈 Resumen General</h3>
<div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 10px;">
<div style="background: white; padding: 10px; border-radius: 5px;">
<strong>Referencias Encontradas</strong><br>
<span style="font-size: 24px; color: #3498db;">{total}</span>
</div>
<div style="background: white; padding: 10px; border-radius: 5px;">
<strong>Verificadas</strong><br>
<span style="font-size: 24px; color: #2ecc71;">{verified}</span>
</div>
<div style="background: white; padding: 10px; border-radius: 5px;">
<strong>Tasa de Éxito</strong><br>
<span style="font-size: 24px; color: #9b59b6;">{success_rate:.1f}%</span>
</div>
<div style="background: white; padding: 10px; border-radius: 5px;">
<strong>Tiempo</strong><br>
<span style="font-size: 24px; color: #e74c3c;">{report_data.get('processing_time', 0):.1f}s</span>
</div>
</div>
</div>
<div style="background: #d5f4e6; padding: 15px; border-radius: 10px; margin: 15px 0;">
<h3 style="color: #27ae60;">🤖 Nebius AI</h3>
<p><strong>Estado:</strong> {'✅ Activado' if nebius_usage.get('enabled') else '❌ Desactivado'}</p>
<p><strong>Llamadas API:</strong> {nebius_usage.get('calls', 0)}</p>
<p><strong>Referencias Mejoradas:</strong> {nebius_usage.get('enhanced_references', 0)}</p>
</div>
<div style="background: #e8f4fc; padding: 15px; border-radius: 10px; margin: 15px 0;">
<h3 style="color: #2980b9;">📥 Descargas</h3>
<p><strong>Archivos Descargados:</strong> {len(report_data.get('downloaded_files', []))}</p>
<ul>
"""
for file in report_data.get("downloaded_files", [])[:5]:
filename = os.path.basename(file)
html += f'<li>{filename}</li>'
html += """
</ul>
</div>
<div style="background: #fdebd0; padding: 15px; border-radius: 10px; margin: 15px 0;">
<h3 style="color: #d35400;">⚠️ Referencias No Verificadas</h3>
<p><strong>Total:</strong> {failed}</p>
""".format(failed=len(report_data.get("failed_verifications", [])))
html += """
</div>
</div>
"""
return html
def _generate_text_report(report_data: Dict) -> str:
"""Generate the plain-text report"""
verified = len(report_data.get("verified_resources", []))
total = report_data.get("total_citations", 0)
text = f"""
REPORTE DE PROCESAMIENTO
========================
Archivo: {report_data.get('input_file', 'Desconocido')}
Fecha: {report_data.get('timestamp', '')}
ESTADÍSTICAS:
-------------
• Referencias encontradas: {total}
• Referencias verificadas: {verified}
• Archivos descargados: {len(report_data.get('downloaded_files', []))}
• Tiempo de procesamiento: {report_data.get('processing_time', 0):.2f}s
• Tasa de éxito: {(verified/max(1, total))*100:.1f}%
NEBIUS AI:
----------
• Estado: {'Activado' if report_data.get('nebius_usage', {}).get('enabled') else 'Desactivado'}
• Llamadas API: {report_data.get('nebius_usage', {}).get('calls', 0)}
• Referencias mejoradas: {report_data.get('nebius_usage', {}).get('enhanced_references', 0)}
Para más detalles, consulte el archivo ZIP con el reporte completo.
"""
return text
def _generate_stats_display(summary: Dict) -> str:
"""Generate the quick-stats display"""
return f"""
⚡ PROCESO COMPLETADO ⚡
📊 Estadísticas Rápidas:
• Referencias: {summary.get('references_found', 0)}
• Verificadas: {summary.get('verified', 0)}
• Descargadas: {summary.get('downloaded', 0)}
• Tasa de éxito: {summary.get('success_rate', '0%')}
• Tiempo: {summary.get('processing_time', '0s')}
"""
def get_system_stats():
"""Obtener estadísticas del sistema"""
nonlocal system
if not system:
return "❌ Sistema no inicializado"
stats = system.get_stats()
return f"""
📈 Estadísticas del Sistema Nebius:
• Documentos procesados: {stats.get('total_processed', 0)}
• Referencias totales: {stats.get('total_references', 0)}
• Llamadas Nebius API: {stats.get('nebius_calls', 0)}
• Tasa de éxito promedio: {stats.get('success_rate', 0) * 100:.1f}%
• Directorio de salida: {stats.get('output_directory', 'N/A')}
"""
# Crear interfaz
with gr.Blocks(title="Nebius Bibliography System", theme=gr.themes.Soft()) as interface:
gr.Markdown("# 📚 Sistema de Recopilación Bibliográfica con Nebius AI")
gr.Markdown("Procesa documentos académicos usando Nebius AI para extracción y verificación inteligente")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### ⚙️ Configuración Nebius AI")
provider = gr.Dropdown(
choices=["nebius", "openai"],
label="Proveedor de IA Principal",
value="nebius",
info="Selecciona Nebius para usar la API de Nebius AI"
)
model = gr.Textbox(
label="Modelo",
value="neural-chat-7b-v3-1",
placeholder="Modelo de Nebius (ej: neural-chat-7b-v3-1)"
)
nebius_key = gr.Textbox(
label="Nebius API Key",
type="password",
placeholder="Ingresa tu API Key de Nebius"
)
nebius_base = gr.Textbox(
label="Nebius API Base (opcional)",
value="https://api.studio.nebius.com/v1",
placeholder="URL base de la API de Nebius"
)
openai_key = gr.Textbox(
label="OpenAI API Key (respaldo)",
type="password",
placeholder="Opcional: Key de OpenAI como respaldo"
)
init_btn = gr.Button("🚀 Inicializar Sistema Nebius", variant="primary")
init_status = gr.Markdown("")
gr.Markdown("---")
stats_btn = gr.Button("📊 Estadísticas del Sistema")
system_stats = gr.Markdown("")
with gr.Column(scale=2):
gr.Markdown("### 📄 Procesar Documento")
file_input = gr.File(
label="Sube tu documento",
file_types=[".txt", ".pdf", ".docx", ".html", ".md"]
)
use_nebius = gr.Checkbox(
label="Usar Nebius AI para mejora de precisión",
value=True
)
process_btn = gr.Button("🔍 Procesar con Nebius AI", variant="primary")
gr.Markdown("### 📦 Resultados")
result_file = gr.File(label="Descargar Paquete Completo (ZIP)")
result_status = gr.Markdown("")
stats_display = gr.Markdown("")
with gr.Tabs():
with gr.TabItem("📋 Vista HTML"):
html_output = gr.HTML(label="Reporte Interactivo")
with gr.TabItem("📝 Texto Plano"):
text_output = gr.Textbox(
label="Resumen",
lines=15,
max_lines=30
)
with gr.TabItem("🔧 JSON Completo"):
json_output = gr.Code(
label="Datos Completos",
language="json",
lines=20
)
# Conectar eventos
init_btn.click(
initialize_system,
inputs=[provider, model, nebius_key, nebius_base, openai_key],
outputs=init_status
)
process_btn.click(
process_document,
inputs=[file_input, use_nebius],
outputs=[result_file, result_status, html_output, text_output, json_output, stats_display]
)
stats_btn.click(
get_system_stats,
outputs=system_stats
)
# Información
gr.Markdown("""
### 📌 Características Nebius AI
**🔍 Extracción Inteligente:**
- Identificación contextual de referencias
- Corrección automática de identificadores
- Clasificación por tipo de recurso
**✅ Verificación Avanzada:**
- Análisis de accesibilidad
- Detección de acceso abierto
- Sugerencias de fuentes alternativas
**📊 Reportes Mejorados:**
- Métricas de confianza Nebius
- Análisis de calidad por referencia
- Estadísticas de uso de IA
### ⚠️ Notas Importantes
1. La API de Nebius requiere una key válida
2. Los archivos grandes pueden consumir más tokens
3. Se recomienda usar Nebius para máxima precisión
4. Mantén tu API key segura y no la compartas
### 🔗 Recursos
• [Documentación Nebius AI](https://docs.nebius.com)
• [Obtener API Key](https://studio.nebius.com)
• [Soporte Técnico](https://support.nebius.com)
""")
return interface
# ========== MAIN ENTRY POINT ==========
async def main():
"""Main entry point"""
import argparse
parser = argparse.ArgumentParser(description="Sistema Nebius de Recopilación Bibliográfica")
parser.add_argument("--mode", choices=["gui", "cli"], default="gui",
help="Modo de ejecución")
parser.add_argument("--file", type=str, help="Archivo a procesar (modo CLI)")
parser.add_argument("--nebius-key", help="API Key de Nebius")
parser.add_argument("--model", default="neural-chat-7b-v3-1", help="Modelo Nebius")
parser.add_argument("--api-base", default="https://api.studio.nebius.com/v1",
help="URL base de Nebius API")
args = parser.parse_args()
if args.mode == "gui":
# Ejecutar interfaz Gradio
interface = create_nebius_interface()
interface.launch(
server_name="0.0.0.0",
server_port=7860,
share=True,
debug=True
)
elif args.mode == "cli":
# Modo línea de comandos
if not args.file:
print("❌ Error: Debes especificar un archivo con --file")
return
if not os.path.exists(args.file):
print(f"❌ Error: Archivo no encontrado: {args.file}")
return
if not args.nebius_key:
print("⚠️ Advertencia: No se proporcionó API Key de Nebius")
use_nebius = False
nebius_key = None
else:
use_nebius = True
nebius_key = args.nebius_key
# Configurar sistema
config = {
"llm_provider": "nebius" if use_nebius else "openai",
"llm_model": args.model,
"nebius_api_key": nebius_key,
"nebius_api_base": args.api_base,
"use_nebius": use_nebius
}
system = NebiusBibliographySystem(config)
print(f"🔍 Procesando archivo: {args.file}")
print(f"🤖 Nebius AI: {'Activado' if use_nebius else 'Desactivado'}")
print("⏳ Procesando...")
result = await system.process_document(args.file)
if result.get("success"):
print(f"✅ Procesamiento completado!")
print(f"📊 ID del proceso: {result.get('process_id')}")
summary = result.get("summary", {})
print(f"""
📈 Resultados:
- Referencias encontradas: {summary.get('references_found', 0)}
- Referencias verificadas: {summary.get('verified', 0)}
- Archivos descargados: {summary.get('downloaded', 0)}
- Tasa de éxito: {summary.get('success_rate', '0%')}
- Tiempo de procesamiento: {summary.get('processing_time', '0s')}
📦 Paquete de resultados: {result.get('zip_path')}
📊 Estadísticas Nebius:
- Llamadas API: {result.get('report', {}).get('nebius_usage', {}).get('calls', 0)}
- Referencias mejoradas: {result.get('report', {}).get('nebius_usage', {}).get('enhanced_references', 0)}
""")
else:
print(f"❌ Error: {result.get('error')}")
if __name__ == "__main__":
asyncio.run(main())
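# Example invocations (illustrative; file names and keys are placeholders):
#   python app.py                                              # launch the Gradio UI on port 7860
#   python app.py --mode cli --file refs.txt --nebius-key <your-key>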