Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import json | |
| import logging | |
| import zipfile | |
| import asyncio | |
| import tempfile | |
| from typing import Dict, List, Optional, Any, Tuple | |
| from dataclasses import dataclass, field | |
| from pathlib import Path | |
| from datetime import datetime | |
| import gradio as gr | |
| from enum import Enum | |
| import hashlib | |
| import urllib.parse | |
| import aiohttp | |
| # Importar smolagents | |
| from smolagents import CodeAgent, ToolCallingAgent, LiteLLMModel | |
| from smolagents.tools import Tool, tool | |
| from pydantic import BaseModel, Field | |
# Logging configuration: every record goes both to a log file in the current
# working directory and to the console.
logging.basicConfig(
    handlers=[
        logging.FileHandler('bibliography_nebius.log'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by every class in this file.
logger = logging.getLogger(__name__)
| # ========== CONFIGURACIÓN NEBIUS API ========== | |
class NebiusAPI:
    """Small async client for the Nebius AI chat-completions endpoint.

    All public coroutines are best-effort: network, HTTP and parsing failures
    are logged and converted into an empty/negative result instead of raising.
    """

    # Upper bound, in seconds, for one complete HTTP round trip.
    REQUEST_TIMEOUT_SECONDS = 30
    # Maximum number of characters of the analysed text embedded in a prompt.
    MAX_PROMPT_TEXT_CHARS = 5000

    def __init__(self, api_key: str, base_url: str = "https://api.studio.nebius.com"):
        """Store the credentials and pre-build the request headers.

        Args:
            api_key: Bearer token sent in the ``Authorization`` header.
            base_url: API root, without the ``/v1/...`` path.
        """
        self.api_key = api_key
        self.base_url = base_url
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }

    async def generate_text(self, prompt: str, model: str = "neural-chat-7b-v3-1",
                            max_tokens: int = 1000, temperature: float = 0.7) -> str:
        """Send a single-turn chat completion request and return the reply text.

        Args:
            prompt: User message sent to the model.
            model: Model identifier forwarded in the payload.
            max_tokens: Completion length limit forwarded in the payload.
            temperature: Sampling temperature forwarded in the payload.

        Returns:
            The content of the first choice, or ``""`` on any HTTP, network or
            response-shape problem (the problem is logged).
        """
        url = f"{self.base_url}/v1/chat/completions"
        payload = {
            "model": model,
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": 0.95,
        }
        try:
            # aiohttp expects a ClientTimeout object; a bare number is deprecated.
            timeout = aiohttp.ClientTimeout(total=self.REQUEST_TIMEOUT_SECONDS)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.post(url, headers=self.headers, json=payload) as response:
                    if response.status != 200:
                        error_text = await response.text()
                        logger.error(f"Nebius API error {response.status}: {error_text}")
                        return ""
                    data = await response.json()
        except Exception as e:
            logger.error(f"Error calling Nebius API: {e}")
            return ""
        return self._first_choice_content(data)

    @staticmethod
    def _first_choice_content(data: Any) -> str:
        """Return ``choices[0].message.content`` from a response, or ``""``."""
        if not isinstance(data, dict):
            return ""
        choices = data.get("choices") or []
        if not isinstance(choices, list) or not choices or not isinstance(choices[0], dict):
            return ""
        message = choices[0].get("message") or {}
        content = message.get("content") if isinstance(message, dict) else None
        return content if isinstance(content, str) else ""

    @staticmethod
    def _extract_json_object(response: str) -> Optional[Dict[str, Any]]:
        """Parse the outermost ``{...}`` span of *response* as a JSON object.

        Returns ``None`` when there is no brace-delimited span or the parsed
        value is not a dict. Raises ``ValueError`` (``json.JSONDecodeError``)
        when the span is not valid JSON.
        """
        match = re.search(r'\{.*\}', response or "", re.DOTALL)
        if not match:
            return None
        parsed = json.loads(match.group())
        return parsed if isinstance(parsed, dict) else None

    async def extract_references(self, text: str) -> List[Dict[str, Any]]:
        """Ask the model to list the bibliographic references found in *text*.

        Only the first ``MAX_PROMPT_TEXT_CHARS`` characters are sent.

        Returns:
            The dict entries of the ``references`` array in the model's JSON
            reply; ``[]`` when the reply has no usable JSON.
        """
        # Truncate outside the f-string so no stray note leaks into the prompt.
        excerpt = (text or "")[:self.MAX_PROMPT_TEXT_CHARS]
        prompt = f"""Analiza el siguiente texto y extrae todas las referencias bibliográficas.
Identifica DOIs, ISBNs, URLs académicas, arXiv IDs y otras referencias académicas.
Texto:
{excerpt}
Devuelve un JSON con el siguiente formato:
{{
"references": [
{{
"type": "doi|isbn|arxiv|url|pmid|other",
"identifier": "identificador_completo",
"raw_text": "texto_original_encontrado",
"confidence": 0.0-1.0,
"context": "texto_alrededor_del_identificador"
}}
]
}}
Solo devuelve el JSON, sin texto adicional."""
        response = await self.generate_text(prompt, max_tokens=2000)
        try:
            data = self._extract_json_object(response)
        except ValueError as e:
            logger.error(f"Error parsing Nebius response: {e}")
            return []
        if data is None:
            return []
        references = data.get("references", [])
        if not isinstance(references, list):
            return []
        # Callers use dict access on every entry, so drop anything else.
        return [ref for ref in references if isinstance(ref, dict)]

    async def verify_reference(self, reference: Dict[str, Any]) -> Dict[str, Any]:
        """Ask the model to assess a reference's validity and accessibility.

        Args:
            reference: Dict whose ``type``, ``identifier`` and optional
                ``context`` keys are embedded in the prompt.

        Returns:
            The JSON object produced by the model, or a negative placeholder
            (``valid`` False, ``confidence`` 0.0) when none could be parsed.
        """
        prompt = f"""Verifica la siguiente referencia académica y proporciona información sobre su accesibilidad:
Tipo: {reference.get('type')}
Identificador: {reference.get('identifier')}
Contexto: {reference.get('context', 'No disponible')}
Analiza:
1. ¿Es un identificador válido?
2. ¿Dónde podría encontrarse este recurso?
3. ¿Es probable que esté disponible en acceso abierto?
4. Proporciona posibles URLs para acceder al recurso.
Devuelve un JSON con el siguiente formato:
{{
"valid": true/false,
"confidence": 0.0-1.0,
"sources": ["lista", "de", "posibles", "fuentes"],
"likely_open_access": true/false,
"suggested_urls": ["url1", "url2"],
"notes": "notas_adicionales"
}}"""
        response = await self.generate_text(prompt, max_tokens=1000)
        try:
            analysis = self._extract_json_object(response)
            if analysis is not None:
                return analysis
        except ValueError as e:
            logger.error(f"Error parsing verification response: {e}")
        return {"valid": False, "confidence": 0.0, "sources": [], "notes": "Error en verificación"}
| # ========== MODELOS DE DATOS ========== | |
class ResourceType(str, Enum):
    """Kinds of bibliographic identifier handled by the pipeline.

    Members are also ``str`` instances, so they compare equal to their
    lowercase value and serialise to JSON as plain strings.
    """

    # Identifier schemes.
    DOI = "doi"
    ISBN = "isbn"
    ARXIV = "arxiv"
    URL = "url"
    PMID = "pmid"
    # Free-form reference formats.
    BIBTEX = "bibtex"
    CITATION = "citation"
    # Fallback for anything that could not be classified.
    UNKNOWN = "unknown"
class CitationModel(BaseModel):
    """One reference extracted from an input document."""

    # Short identifier of the citation (callers pass a truncated MD5 hex digest).
    id: str
    # Text exactly as it was matched in the document.
    raw_text: str
    # Classification of the identifier.
    resource_type: ResourceType
    # Cleaned identifier string (DOI, ISBN, arXiv ID, URL, ...).
    identifier: str
    # Free-form extra data; empty by default.
    metadata: Dict[str, Any] = Field(default_factory=dict)
    # Extraction confidence.
    confidence: float = 0.0
    # Path of the document the citation came from.
    extracted_from: str
    # (start, end) offsets of the match; (0, 0) when unknown.
    position: Tuple[int, int] = (0, 0)
    # Nebius-related flags filled in during verification.
    nebius_verified: bool = False
    nebius_confidence: float = 0.0
class VerificationResult(BaseModel):
    """Outcome of verifying a single citation."""

    # The citation this result refers to.
    citation: CitationModel
    # Whether the reference was confirmed.
    verified: bool
    # Label of the mechanism that confirmed it (e.g. "direct", "nebius").
    verification_source: str
    # Download details; each may be None but must be passed explicitly.
    download_url: Optional[str]
    file_format: Optional[str]
    file_size: Optional[int]
    # Score attached by the verification step.
    quality_score: float
    # Human-readable remarks collected during verification.
    notes: List[str] = Field(default_factory=list)
    # Raw analysis returned by Nebius, when one was requested.
    nebius_analysis: Optional[Dict[str, Any]] = None
class ProcessingReport(BaseModel):
    """Aggregate report produced after processing one document."""

    # Path of the processed document.
    input_file: str
    # Number of references found in it.
    total_citations: int
    # Successful verifications, in processing order.
    verified_resources: List[VerificationResult]
    # Paths of the files that were downloaded.
    downloaded_files: List[str]
    # Citations that could not be verified.
    failed_verifications: List[CitationModel]
    # Wall-clock duration of the run, in seconds.
    processing_time: float
    # Derived ratios (success rate, download rate, average quality).
    summary: Dict[str, Any] = Field(default_factory=dict)
    # ISO-8601 creation time, taken from the local clock at construction.
    timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())
    # Counters describing how Nebius was used during the run.
    nebius_usage: Dict[str, Any] = Field(default_factory=dict)
| # ========== HERRAMIENTAS CON INTEGRACIÓN NEBIUS ========== | |
class NebiusEnhancedExtractionTool(Tool):
    """Tool that extracts bibliographic identifiers from free text.

    A regex pass always runs. When a Nebius API key is supplied and AI
    enhancement is requested, the model's own extraction is merged in.
    """

    name = "nebius_extract_references"
    description = """
    Extract bibliographic references using Nebius AI for enhanced accuracy.
    Args:
    text (str): Text to analyze
    nebius_api_key (str): Nebius API key
    use_ai_enhancement (bool): Whether to use Nebius AI for enhancement
    Returns:
    List[Dict]: Extracted references with Nebius AI analysis
    """
    # smolagents validates these two attributes when the tool is instantiated;
    # arguments that have a default in ``forward`` must be marked nullable.
    inputs = {
        "text": {"type": "string", "description": "Text to analyze"},
        "nebius_api_key": {
            "type": "string",
            "description": "Nebius API key",
            "nullable": True,
        },
        "use_ai_enhancement": {
            "type": "boolean",
            "description": "Whether to use Nebius AI for enhancement",
            "nullable": True,
        },
    }
    output_type = "array"

    # Labels such as "doi:" / "arXiv:" / "PMID:" in any letter case.
    _LABEL_RE = re.compile(r'^(?:doi|arxiv|isbn|pmid)\s*:\s*', re.IGNORECASE)
    # "ISBN", "ISBN-10", "ISBN-13" with or without a trailing colon.
    _ISBN_LABEL_RE = re.compile(r'^isbn(?:-1[03])?\s*:?\s*', re.IGNORECASE)

    def __init__(self):
        super().__init__()
        # Baseline regex patterns, applied case-insensitively.
        self.patterns = {
            ResourceType.DOI: [
                r'\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b',
                r'doi:\s*(10\.\d{4,9}/[-._;()/:A-Z0-9]+)',
            ],
            ResourceType.ISBN: [
                r'ISBN(?:-1[03])?:?\s*(?=[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?[0-9]+[- ]?[0-9]+[- ]?[0-9X]',
            ],
            ResourceType.ARXIV: [
                r'arXiv:\s*(\d{4}\.\d{4,5}(v\d+)?)',
                r'arxiv:\s*([a-z\-]+/\d{7})'
            ],
        }

    def forward(self, text: str, nebius_api_key: str = None,
                use_ai_enhancement: bool = False) -> List[Dict[str, Any]]:
        """Extract references, optionally enriched with Nebius results.

        Args:
            text: Document text to scan.
            nebius_api_key: API key; without it only the regex pass runs.
            use_ai_enhancement: Whether to also query Nebius.

        Returns:
            Reference dicts. If the Nebius step fails for any reason the
            regex-only result is returned and the error is logged.
        """
        basic_references = self._extract_basic(text)
        if not use_ai_enhancement or not nebius_api_key:
            return basic_references
        try:
            nebius = NebiusAPI(nebius_api_key)
            nebius_references = self._run_coroutine_sync(
                nebius.extract_references(text[:10000])  # cap the size sent to the API
            )
            return self._merge_references(basic_references, nebius_references)
        except Exception as e:
            logger.error(f"Error using Nebius enhancement: {e}")
            return basic_references

    @staticmethod
    def _run_coroutine_sync(coro):
        """Run *coro* to completion from synchronous code and return its result.

        When the calling thread already runs an event loop, the coroutine is
        driven on a private loop in a worker thread, so the caller's loop is
        neither re-entered nor replaced.
        """
        import concurrent.futures

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running in this thread: drive the coroutine directly.
            return asyncio.run(coro)
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, coro).result()

    @staticmethod
    def _coerce_confidence(value: Any, default: float) -> float:
        """Convert *value* to a float clamped to [0, 1]; *default* if impossible."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return default
        if number != number:  # NaN
            return default
        return min(1.0, max(0.0, number))

    def _extract_basic(self, text: str) -> List[Dict[str, Any]]:
        """Run every regex pattern over *text* and return unique references."""
        references = []
        if not text:
            return references
        # Several patterns can hit the same identifier (e.g. a DOI with and
        # without its "doi:" label); keep only the first hit per identifier.
        seen = set()
        for resource_type, patterns in self.patterns.items():
            for pattern in patterns:
                for match in re.finditer(pattern, text, re.IGNORECASE):
                    captured = match.group(1) if match.groups() else None
                    identifier = self._clean_identifier(captured or match.group(0), resource_type)
                    key = (resource_type, identifier)
                    if not identifier or key in seen:
                        continue
                    seen.add(key)
                    references.append({
                        "id": hashlib.md5(identifier.encode()).hexdigest()[:12],
                        "raw_text": match.group(0),
                        "type": resource_type.value,
                        "identifier": identifier,
                        "confidence": 0.8,
                        "context": self._get_context(text, match.start(), match.end()),
                        "position": (match.start(), match.end()),
                        "extraction_method": "regex"
                    })
        return references

    def _merge_references(self, basic: List[Dict], nebius: List[Dict]) -> List[Dict]:
        """Combine regex references with the ones reported by Nebius.

        Entries sharing an identifier are merged (highest confidence wins);
        new Nebius entries are converted to the local format. Malformed
        entries and entries without an identifier are ignored, and types
        outside ``ResourceType`` are mapped to ``unknown``.
        """
        # Copy each dict so the caller's regex-only list is left untouched.
        merged = [dict(ref) for ref in basic]
        by_identifier = {}
        for ref in merged:
            by_identifier.setdefault(ref.get('identifier'), ref)
        known_types = {member.value: member for member in ResourceType}

        for nebius_ref in nebius or []:
            if not isinstance(nebius_ref, dict):
                continue
            type_label = str(nebius_ref.get('type', '')).strip().lower()
            resource_type = known_types.get(type_label, ResourceType.UNKNOWN)
            identifier = self._clean_identifier(
                str(nebius_ref.get('identifier') or ''), resource_type
            )
            if not identifier:
                continue

            existing = by_identifier.get(identifier)
            if existing is not None:
                existing['confidence'] = max(
                    self._coerce_confidence(existing.get('confidence', 0), 0.0),
                    self._coerce_confidence(nebius_ref.get('confidence', 0), 0.0),
                )
                if existing.get('extraction_method') == 'regex':
                    existing['extraction_method'] = 'regex+nebius'
                continue

            new_ref = {
                "id": hashlib.md5(identifier.encode()).hexdigest()[:12],
                "raw_text": nebius_ref.get('raw_text', ''),
                "type": resource_type.value,
                "identifier": identifier,
                "confidence": self._coerce_confidence(nebius_ref.get('confidence', 0.7), 0.7),
                "context": nebius_ref.get('context', ''),
                "position": (0, 0),
                "extraction_method": 'nebius'
            }
            merged.append(new_ref)
            by_identifier[identifier] = new_ref
        return merged

    def _clean_identifier(self, identifier: str, resource_type: ResourceType) -> str:
        """Strip labels and wrapping punctuation from a raw identifier.

        ISBNs additionally lose their ``ISBN``/``ISBN-13`` label, spaces and
        hyphens; URLs without a scheme get ``https://`` prepended.
        """
        identifier = (identifier or "").strip()
        if resource_type == ResourceType.ISBN:
            identifier = self._ISBN_LABEL_RE.sub('', identifier)
            identifier = re.sub(r'[\s-]', '', identifier).upper()
        # Labels may be stacked ("DOI: doi:10..."); strip until none is left.
        stripped = self._LABEL_RE.sub('', identifier)
        while stripped != identifier:
            identifier = stripped
            stripped = self._LABEL_RE.sub('', identifier)
        identifier = identifier.strip('"\'<>()[]{}')
        if resource_type == ResourceType.URL:
            if identifier and not identifier.startswith(('http://', 'https://')):
                identifier = f'https://{identifier}'
        return identifier

    def _get_context(self, text: str, start: int, end: int, window: int = 100) -> str:
        """Return *text* from ``start - window`` to ``end + window``, clipped."""
        context_start = max(0, start - window)
        context_end = min(len(text), end + window)
        return text[context_start:context_end]
class NebiusVerificationTool(Tool):
    """Tool that checks whether a reference exists and is downloadable.

    A direct lookup (Crossref, arXiv, Open Library or an HTTP request) runs
    first; an optional Nebius analysis can confirm the reference and suggest
    download URLs.
    """

    name = "nebius_verify_reference"
    description = """
    Verify academic references using Nebius AI analysis.
    Args:
    reference (Dict): Reference to verify
    nebius_api_key (str): Nebius API key
    deep_verify (bool): Whether to perform deep verification
    Returns:
    Dict: Verification results with Nebius analysis
    """
    # smolagents validates these two attributes when the tool is instantiated;
    # arguments that have a default in ``forward`` must be marked nullable.
    inputs = {
        "reference": {"type": "object", "description": "Reference to verify"},
        "nebius_api_key": {
            "type": "string",
            "description": "Nebius API key",
            "nullable": True,
        },
        "deep_verify": {
            "type": "boolean",
            "description": "Whether to perform deep verification",
            "nullable": True,
        },
    }
    output_type = "object"

    def __init__(self):
        super().__init__()
        # Headers sent with every direct verification request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def forward(self, reference: Dict[str, Any], nebius_api_key: str = None,
                deep_verify: bool = False) -> Dict[str, Any]:
        """Verify *reference* directly and, optionally, with Nebius.

        Args:
            reference: Dict with at least ``type`` and ``identifier``.
            nebius_api_key: API key; required for the Nebius step.
            deep_verify: Whether to run the Nebius step.

        Returns:
            Dict with ``verified``, ``verification_source``, ``download_url``,
            ``file_format``, ``file_size``, ``quality_score``, ``notes`` and
            ``nebius_analysis``.
        """
        result = {
            "reference": reference,
            "verified": False,
            "verification_source": "direct",
            "download_url": None,
            "file_format": None,
            "file_size": None,
            "quality_score": 0.0,
            "notes": [],
            "nebius_analysis": None
        }
        # Direct lookup first.
        direct_result = self._direct_verification(reference)
        if direct_result.get("verified"):
            result.update(direct_result)
            result["quality_score"] = 0.9
        # Optional Nebius analysis.
        if nebius_api_key and deep_verify:
            nebius_result = self._nebius_verification(reference, nebius_api_key)
            if not isinstance(nebius_result, dict):
                nebius_result = {"valid": False, "confidence": 0.0}
            result["nebius_analysis"] = nebius_result
            if nebius_result.get("valid", False):
                result["verified"] = True
                result["verification_source"] = "nebius"
                result["quality_score"] = max(
                    self._as_float(result.get("quality_score", 0)),
                    self._as_float(nebius_result.get("confidence", 0)),
                )
                # Use a URL suggested by Nebius only when none is known yet.
                if not result.get("download_url"):
                    suggested = self._first_http_url(nebius_result.get("suggested_urls"))
                    if suggested:
                        result["download_url"] = suggested
                result["notes"].append(
                    f"Nebius analysis: {nebius_result.get('notes', 'No notes')}"
                )
        return result

    @staticmethod
    def _as_float(value: Any) -> float:
        """Return *value* as a float, or 0.0 when it is not numeric."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0.0

    @staticmethod
    def _first_http_url(candidates: Any) -> Optional[str]:
        """Return the first http(s) string in *candidates*, if any."""
        if not isinstance(candidates, list):
            return None
        for candidate in candidates:
            if isinstance(candidate, str) and candidate.startswith(('http://', 'https://')):
                return candidate
        return None

    @staticmethod
    def _run_coroutine_sync(coro):
        """Run *coro* to completion from synchronous code and return its result.

        When the calling thread already runs an event loop, the coroutine is
        driven on a private loop in a worker thread, so the caller's loop is
        neither re-entered nor replaced.
        """
        import concurrent.futures

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            return asyncio.run(coro)
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, coro).result()

    def _direct_verification(self, reference: Dict[str, Any]) -> Dict[str, Any]:
        """Dispatch to the verifier matching the reference type.

        Unknown types and any exception yield an unverified result.
        """
        ref_type = reference.get("type", "")
        identifier = reference.get("identifier", "")
        verifiers = {
            "doi": self._verify_doi,
            "arxiv": self._verify_arxiv,
            "url": self._verify_url,
            "isbn": self._verify_isbn,
        }
        verifier = verifiers.get(ref_type)
        if verifier is not None:
            try:
                return verifier(identifier)
            except Exception as e:
                logger.error(f"Direct verification error: {e}")
        return {"verified": False, "notes": [f"Direct verification failed for {ref_type}"]}

    def _verify_doi(self, doi: str) -> Dict[str, Any]:
        """Look the DOI up in Crossref and pick a PDF link when one is listed."""
        import requests
        try:
            # DOIs may contain characters that are not URL-safe; keep "/" only.
            url = f"https://api.crossref.org/works/{urllib.parse.quote(doi, safe='/')}"
            response = requests.get(url, headers=self.headers, timeout=10)
            if response.status_code == 200:
                work = response.json().get('message', {})
                result = {"verified": True, "notes": ["Verified via Crossref"]}
                for link in work.get('link', []):
                    if link.get('content-type') == 'application/pdf':
                        result["download_url"] = link.get('URL')
                        result["file_format"] = "pdf"
                        break
                return result
        except Exception as e:
            logger.error(f"DOI verification error: {e}")
        return {"verified": False}

    @staticmethod
    def _arxiv_feed_has_entry(feed_xml: str) -> bool:
        """Tell whether an arXiv Atom feed contains a real (non-error) entry.

        The arXiv API answers HTTP 200 even for unknown or malformed IDs:
        the feed is then empty or holds an entry whose id points at
        ``.../api/errors``.
        """
        import xml.etree.ElementTree as ET

        atom = "{http://www.w3.org/2005/Atom}"
        try:
            root = ET.fromstring(feed_xml)
        except ET.ParseError:
            return False
        for entry in root.iter(f"{atom}entry"):
            entry_id = entry.findtext(f"{atom}id", default="") or ""
            if "api/errors" not in entry_id:
                return True
        return False

    def _verify_arxiv(self, arxiv_id: str) -> Dict[str, Any]:
        """Check that the arXiv API knows the ID and build its PDF URL."""
        import requests
        try:
            # Drop an "arXiv:" label if one is still attached.
            if 'arxiv:' in arxiv_id.lower():
                arxiv_id = arxiv_id.split(':')[-1].strip()
            response = requests.get(
                "http://export.arxiv.org/api/query",
                params={"id_list": arxiv_id},
                headers=self.headers,
                timeout=10,
            )
            # A 200 alone is not enough: the feed must contain a real entry.
            if response.status_code == 200 and self._arxiv_feed_has_entry(response.text):
                return {
                    "verified": True,
                    "download_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf",
                    "file_format": "pdf",
                    "notes": ["arXiv paper available"]
                }
        except Exception as e:
            logger.error(f"arXiv verification error: {e}")
        return {"verified": False}

    def _verify_url(self, url: str) -> Dict[str, Any]:
        """Check that the URL answers 200 and detect whether it serves a PDF."""
        import requests
        try:
            response = requests.head(url, headers=self.headers, timeout=10, allow_redirects=True)
            if response.status_code in (405, 501):
                # Server does not implement HEAD: retry with GET, headers only.
                response = requests.get(url, headers=self.headers, timeout=10,
                                        allow_redirects=True, stream=True)
                response.close()
            if response.status_code == 200:
                result = {"verified": True, "notes": [f"URL accessible: {response.status_code}"]}
                content_type = response.headers.get('content-type', '')
                if 'application/pdf' in content_type:
                    result["download_url"] = url
                    result["file_format"] = "pdf"
                return result
        except Exception as e:
            logger.error(f"URL verification error: {e}")
        return {"verified": False}

    def _verify_isbn(self, isbn: str) -> Dict[str, Any]:
        """Look the ISBN up in Open Library; verified when a record comes back."""
        import requests
        try:
            bibkey = urllib.parse.quote(f"ISBN:{isbn}", safe=':')
            url = f"https://openlibrary.org/api/books?bibkeys={bibkey}&format=json"
            response = requests.get(url, headers=self.headers, timeout=10)
            # Open Library answers 200 with an empty object for unknown ISBNs.
            if response.status_code == 200 and response.json():
                return {
                    "verified": True,
                    "notes": ["ISBN found in Open Library"]
                }
        except Exception as e:
            logger.error(f"ISBN verification error: {e}")
        return {"verified": False}

    def _nebius_verification(self, reference: Dict[str, Any], api_key: str) -> Dict[str, Any]:
        """Run the Nebius analysis synchronously; negative result on error."""
        try:
            nebius = NebiusAPI(api_key)
            return self._run_coroutine_sync(nebius.verify_reference(reference))
        except Exception as e:
            logger.error(f"Nebius verification error: {e}")
            return {"valid": False, "confidence": 0.0, "notes": f"Error: {str(e)}"}
| # ========== SISTEMA PRINCIPAL CON NEBIUS ========== | |
class NebiusBibliographySystem:
    """Bibliography pipeline: extract, verify, download and report.

    Output goes under ``nebius_bibliography/`` (``downloads``, ``reports``
    and ``logs`` sub-directories), created on construction.
    """

    # File extensions recognised in URLs, mapped to the extension used on disk.
    _KNOWN_EXTENSIONS = {
        '.pdf': '.pdf',
        '.docx': '.docx',
        '.doc': '.doc',
        '.html': '.html',
        '.htm': '.html',
        '.txt': '.txt',
        '.epub': '.epub',
    }

    def __init__(self, config: Dict[str, Any]):
        """Create tools, the LLM model and the output directories.

        Args:
            config: Settings dict; keys read here and in ``_configure_llm``
                are ``nebius_api_key``, ``llm_provider``, ``llm_model``,
                ``nebius_api_base`` and ``openai_api_key``.
        """
        self.config = config
        self.nebius_api_key = config.get("nebius_api_key")
        self.use_nebius = bool(self.nebius_api_key)
        self.extraction_tool = NebiusEnhancedExtractionTool()
        self.verification_tool = NebiusVerificationTool()
        self.llm_model = self._configure_llm()
        # Output locations.
        self.output_base = "nebius_bibliography"
        self.download_dir = os.path.join(self.output_base, "downloads")
        self.report_dir = os.path.join(self.output_base, "reports")
        self.log_dir = os.path.join(self.output_base, "logs")
        for dir_path in [self.output_base, self.download_dir, self.report_dir, self.log_dir]:
            os.makedirs(dir_path, exist_ok=True)
        # Running counters across all processed documents.
        self.stats = {
            "total_processed": 0,
            "total_references": 0,
            "nebius_calls": 0,
            "success_rate": 0.0
        }
        logger.info(f"Nebius system initialized. Nebius AI: {'Enabled' if self.use_nebius else 'Disabled'}")

    def _configure_llm(self):
        """Build the ``LiteLLMModel`` selected by ``llm_provider``.

        "nebius" (with a key) and "openai" are handled explicitly; any other
        provider falls back to Nebius when a key exists, else to ``gpt-4``.
        """
        # NOTE(review): LiteLLM may need a provider-prefixed model id for a
        # custom api_base -- confirm against the LiteLLM documentation.
        provider = self.config.get("llm_provider", "openai")
        if provider == "nebius" and self.nebius_api_key:
            return LiteLLMModel(
                model_id=self.config.get("llm_model", "neural-chat-7b-v3-1"),
                api_key=self.nebius_api_key,
                api_base=self.config.get("nebius_api_base", "https://api.studio.nebius.com/v1")
            )
        if provider == "openai":
            return LiteLLMModel(
                model_id=self.config.get("llm_model", "gpt-4"),
                api_key=self.config.get("openai_api_key")
            )
        if self.nebius_api_key:
            return LiteLLMModel(
                model_id="neural-chat-7b-v3-1",
                api_key=self.nebius_api_key,
                api_base="https://api.studio.nebius.com/v1"
            )
        return LiteLLMModel(model_id="gpt-4")

    async def process_document(self, file_path: str, process_id: str = None) -> Dict[str, Any]:
        """Run the whole pipeline on one document.

        Args:
            file_path: Path of the text document to analyse.
            process_id: Optional run identifier; generated when omitted.

        Returns:
            On success a dict with ``success``, ``process_id``, ``report``,
            ``zip_path`` and ``summary``; on failure the dict built by
            ``_error_result``. Exceptions are logged, not propagated.
        """
        import time
        start_time = time.time()
        process_id = process_id or self._generate_process_id(file_path)
        logger.info(f"[{process_id}] Processing document: {file_path}")
        try:
            # 1. Read the file.
            file_content = self._read_file(file_path)
            if not file_content:
                return self._error_result(process_id, "Empty or unreadable file")

            # 2. Extract references.
            logger.info(f"[{process_id}] Extracting references...")
            references = self.extraction_tool.forward(
                text=file_content,
                nebius_api_key=self.nebius_api_key,
                use_ai_enhancement=self.use_nebius
            )
            if self.use_nebius:
                self.stats["nebius_calls"] += 1
            self.stats["total_references"] += len(references)
            logger.info(f"[{process_id}] Found {len(references)} references")

            # 3. Verify them.
            logger.info(f"[{process_id}] Verifying references...")
            verification_results, failed_verifications = self._verify_references(
                references, file_path, process_id
            )

            # 4. Download what can be downloaded.
            logger.info(f"[{process_id}] Downloading files...")
            downloaded_files = await self._download_files(verification_results, process_id)

            # 5. Build the report.
            processing_time = time.time() - start_time
            verified_count = len(verification_results)
            report = ProcessingReport(
                input_file=file_path,
                total_citations=len(references),
                verified_resources=verification_results,
                downloaded_files=downloaded_files,
                failed_verifications=failed_verifications,
                processing_time=processing_time,
                summary={
                    "success_rate": verified_count / max(1, len(references)),
                    "download_rate": len(downloaded_files) / max(1, verified_count),
                    "avg_quality": sum(vr.quality_score for vr in verification_results) / max(1, verified_count)
                },
                nebius_usage={
                    "enabled": self.use_nebius,
                    "calls": self.stats["nebius_calls"],
                    "enhanced_references": sum(1 for vr in verification_results if vr.nebius_analysis)
                }
            )

            # 6. Persist the results.
            self._save_results(report, process_id)
            self.stats["total_processed"] += 1
            self.stats["success_rate"] = report.summary.get("success_rate", 0.0)
            logger.info(f"[{process_id}] Processing completed in {processing_time:.2f}s")
            return {
                "success": True,
                "process_id": process_id,
                "report": self._model_to_dict(report),
                "zip_path": self._create_zip(report, process_id),
                "summary": {
                    "references_found": len(references),
                    "verified": verified_count,
                    "downloaded": len(downloaded_files),
                    "success_rate": f"{report.summary.get('success_rate', 0) * 100:.1f}%",
                    "processing_time": f"{processing_time:.2f}s"
                }
            }
        except Exception as e:
            logger.error(f"[{process_id}] Processing error: {e}")
            return self._error_result(process_id, str(e))

    def _verify_references(self, references: List[Dict[str, Any]], file_path: str,
                           process_id: str) -> tuple:
        """Verify every reference; return ``(verified_results, failed_citations)``."""
        verification_results = []
        failed_verifications = []
        for i, ref in enumerate(references):
            if i % 5 == 0:  # progress line every five references
                logger.info(f"[{process_id}] Verified {i}/{len(references)}")
            verification = self.verification_tool.forward(
                reference=ref,
                nebius_api_key=self.nebius_api_key,
                deep_verify=self.use_nebius
            )
            if self.use_nebius:
                # Deep verification is requested for each reference.
                self.stats["nebius_calls"] += 1

            if not verification.get("verified"):
                failed_verifications.append(
                    self._build_citation(ref, file_path, nebius_verified=False, nebius_confidence=0.0)
                )
                continue

            citation = self._build_citation(
                ref,
                file_path,
                nebius_verified=self.use_nebius,
                nebius_confidence=verification.get("quality_score", 0.0),
            )
            verification_results.append(VerificationResult(
                citation=citation,
                verified=True,
                verification_source=verification.get("verification_source", "unknown"),
                download_url=verification.get("download_url"),
                file_format=verification.get("file_format"),
                file_size=verification.get("file_size"),
                quality_score=verification.get("quality_score", 0.0),
                notes=verification.get("notes", []),
                nebius_analysis=verification.get("nebius_analysis")
            ))
        return verification_results, failed_verifications

    @staticmethod
    def _safe_resource_type(value: Any) -> "ResourceType":
        """Map *value* to a ``ResourceType``; unknown labels become ``UNKNOWN``.

        Nebius may report types such as "other" that the enum does not define.
        """
        if isinstance(value, ResourceType):
            return value
        try:
            return ResourceType(str(value).strip().lower())
        except ValueError:
            return ResourceType.UNKNOWN

    def _build_citation(self, ref: Dict[str, Any], file_path: str,
                        nebius_verified: bool, nebius_confidence: float) -> "CitationModel":
        """Convert a raw reference dict into a ``CitationModel``."""
        identifier = ref.get("identifier", "") or ""
        # Fall back to the same hash scheme the extractor uses when id is absent.
        citation_id = ref.get("id") or hashlib.md5(identifier.encode()).hexdigest()[:12]
        return CitationModel(
            id=citation_id,
            raw_text=ref.get("raw_text", ""),
            resource_type=self._safe_resource_type(ref.get("type", "unknown")),
            identifier=identifier,
            confidence=ref.get("confidence", 0.0),
            extracted_from=file_path,
            position=ref.get("position", (0, 0)),
            nebius_verified=nebius_verified,
            nebius_confidence=nebius_confidence
        )

    @staticmethod
    def _model_to_dict(model: Any) -> Dict[str, Any]:
        """Dump a pydantic model; prefers ``model_dump`` over ``dict`` if present."""
        dump = getattr(model, "model_dump", None)
        return dump() if callable(dump) else model.dict()

    def _read_file(self, file_path: str) -> str:
        """Return the file's text (UTF-8, undecodable bytes dropped) or ``""``."""
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()
        except Exception as e:
            logger.error(f"Error reading file {file_path}: {e}")
            return ""

    async def _download_files(self, verification_results: "List[VerificationResult]",
                              process_id: str) -> List[str]:
        """Download every result that has a URL; return the saved paths."""
        downloaded_files = []
        for i, vr in enumerate(verification_results):
            if not vr.download_url:
                continue
            try:
                file_path = await self._download_file(
                    vr.download_url,
                    vr.citation.identifier,
                    process_id,
                    i
                )
                if file_path:
                    downloaded_files.append(file_path)
            except Exception as e:
                logger.error(f"Download failed for {vr.citation.identifier}: {e}")
        return downloaded_files

    async def _download_file(self, url: str, identifier: str,
                             process_id: str, index: int) -> Optional[str]:
        """Download one URL into the downloads directory.

        Returns:
            The saved path, or ``None`` when the request fails, the status
            is not 200, or the body is 100 bytes or shorter.
        """
        try:
            # Build a filesystem-safe, bounded file name from the identifier.
            safe_name = re.sub(r'[^\w\-\.]', '_', identifier)[:100]
            extension = self._get_extension_from_url(url) or ".pdf"  # default
            filename = f"{process_id}_{index:03d}_{safe_name}{extension}"
            filepath = os.path.join(self.download_dir, filename)

            timeout = aiohttp.ClientTimeout(total=60)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(url, headers={'User-Agent': 'Mozilla/5.0'}) as response:
                    if response.status != 200:
                        return None
                    content = await response.read()
            # Tiny bodies are treated as empty or placeholder responses.
            if len(content) <= 100:
                return None
            with open(filepath, 'wb') as f:
                f.write(content)
            logger.info(f"Downloaded: {filename} ({len(content)} bytes)")
            return filepath
        except Exception as e:
            logger.error(f"Download error for {url}: {e}")
            return None

    def _get_extension_from_url(self, url: str) -> str:
        """Guess the file extension for *url*; ``""`` when none is recognised.

        The suffix of the URL path wins. Otherwise the path and query string
        are searched for a known extension (e.g. ``?file=paper.pdf``). The
        host name is never inspected, and ``.doc`` stays ``.doc``.
        """
        parsed = urllib.parse.urlparse(url.lower())
        suffix = os.path.splitext(parsed.path)[1]
        if suffix in self._KNOWN_EXTENSIONS:
            return self._KNOWN_EXTENSIONS[suffix]
        haystack = f"{parsed.path}?{parsed.query}"
        # Longer markers first so ".docx"/".html" are not cut to ".doc"/".htm".
        for marker in ('.pdf', '.docx', '.doc', '.html', '.htm', '.txt', '.epub'):
            if marker in haystack:
                return self._KNOWN_EXTENSIONS[marker]
        return ""

    def _generate_process_id(self, file_path: str) -> str:
        """Return ``NB_<YYYYmmdd_HHMMSS>_<6 hex chars of md5(file_path)>``."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        file_hash = hashlib.md5(file_path.encode()).hexdigest()[:6]
        return f"NB_{timestamp}_{file_hash}"

    def _save_results(self, report: "ProcessingReport", process_id: str):
        """Write the JSON report and the text summary to the reports directory."""
        report_path = os.path.join(self.report_dir, f"{process_id}_report.json")
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(self._model_to_dict(report), f, indent=2, default=str)
        summary_path = os.path.join(self.report_dir, f"{process_id}_summary.txt")
        with open(summary_path, 'w', encoding='utf-8') as f:
            f.write(self._generate_text_summary(report))

    def _create_zip(self, report: "ProcessingReport", process_id: str) -> str:
        """Bundle reports, downloads and a short process log; return the ZIP path."""
        zip_path = os.path.join(self.output_base, f"{process_id}_results.zip")
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            # Reports written for this run.
            for file in os.listdir(self.report_dir):
                if file.startswith(process_id):
                    zipf.write(os.path.join(self.report_dir, file), f"reports/{file}")
            # Downloaded files that still exist on disk.
            for file_path in report.downloaded_files:
                if os.path.exists(file_path):
                    zipf.write(file_path, f"downloads/{os.path.basename(file_path)}")
            # Short process log.
            log_path = os.path.join(self.log_dir, f"{process_id}_log.txt")
            with open(log_path, 'w') as f:
                f.write(f"Process ID: {process_id}\n")
                f.write(f"Time: {datetime.now().isoformat()}\n")
                f.write(f"Success rate: {report.summary.get('success_rate', 0) * 100:.1f}%\n")
            zipf.write(log_path, "process_log.txt")
        return zip_path

    def _generate_text_summary(self, report: "ProcessingReport") -> str:
        """Render *report* as a plain-text summary."""
        header_lines = [
            "",
            "NEBIUS BIBLIOGRAPHY PROCESSING REPORT",
            "=====================================",
            "Process ID: Generated automatically",
            f"Input File: {report.input_file}",
            f"Processing Time: {report.processing_time:.2f} seconds",
            f"Timestamp: {report.timestamp}",
            "SUMMARY STATISTICS",
            "------------------",
            f"Total References Found: {report.total_citations}",
            f"Successfully Verified: {len(report.verified_resources)}",
            f"Files Downloaded: {len(report.downloaded_files)}",
            f"Verification Success Rate: {report.summary.get('success_rate', 0) * 100:.1f}%",
            f"Average Quality Score: {report.summary.get('avg_quality', 0):.2f}",
            "NEBIUS AI USAGE",
            "---------------",
            f"Enabled: {report.nebius_usage.get('enabled', False)}",
            f"API Calls: {report.nebius_usage.get('calls', 0)}",
            f"Enhanced References: {report.nebius_usage.get('enhanced_references', 0)}",
            "VERIFIED RESOURCES (Top 10)",
            "---------------------------",
            "",
        ]
        parts = ["\n".join(header_lines)]
        for i, vr in enumerate(report.verified_resources[:10], 1):
            parts.append(f"\n{i}. {vr.citation.identifier}")
            parts.append(f"\n Type: {vr.citation.resource_type.value}")
            parts.append(f"\n Source: {vr.verification_source}")
            parts.append(f"\n Quality: {vr.quality_score:.2f}")
            parts.append(f"\n Nebius Enhanced: {vr.citation.nebius_verified}")
            if vr.download_url:
                parts.append("\n Downloaded: Yes")
            parts.append("\n")
        if report.failed_verifications:
            parts.append(f"\nFAILED VERIFICATIONS ({len(report.failed_verifications)})\n")
            parts.append("-" * 40 + "\n")
            for citation in report.failed_verifications[:5]:
                parts.append(f"- {citation.identifier} ({citation.resource_type.value})\n")
        parts.append("\nFILES DOWNLOADED\n")
        parts.append("-" * 40 + "\n")
        for file_path in report.downloaded_files:
            if os.path.exists(file_path):
                file_size = os.path.getsize(file_path)
                parts.append(f"- {os.path.basename(file_path)} ({file_size} bytes)\n")
        return "".join(parts)

    def _error_result(self, process_id: str, error: str) -> Dict[str, Any]:
        """Build the failure payload returned by ``process_document``."""
        return {
            "success": False,
            "process_id": process_id,
            "error": error,
            "timestamp": datetime.now().isoformat()
        }

    def get_stats(self) -> Dict[str, Any]:
        """Return the running counters plus the output directory."""
        return {
            "total_processed": self.stats["total_processed"],
            "total_references": self.stats["total_references"],
            "nebius_calls": self.stats["nebius_calls"],
            "success_rate": self.stats["success_rate"],
            "output_directory": self.output_base
        }
| # ========== INTERFAZ GRADIO MEJORADA ========== | |
def create_nebius_interface():
    """Build the Gradio UI for the Nebius bibliography system.

    The returned ``gr.Blocks`` app has a configuration column (provider,
    model, API keys, an initialise button and a statistics button) and a
    processing column (file upload, process button and result views).
    The ``NebiusBibliographySystem`` instance is created lazily by the
    initialise button and shared by the callbacks through closure state.

    Returns:
        The configured ``gr.Blocks`` instance; the caller launches it.
    """
    system = None  # NebiusBibliographySystem, set by initialize_system
    current_process = None  # process_id of the most recent successful run

    def _failure(message):
        """Return a 6-tuple matching the process button's six outputs."""
        return None, message, "", "", "", ""

    def initialize_system(provider, model, nebius_key, nebius_base, openai_key):
        """Create the backend system from the form values.

        Returns:
            A status message for the UI; failures are reported as text
            rather than raised so the page stays usable.
        """
        nonlocal system
        config = {
            "llm_provider": provider,
            "llm_model": model,
            "nebius_api_key": nebius_key,
            # NOTE(review): NebiusAPI.generate_text appends
            # "/v1/chat/completions" to its base_url; confirm this "/v1"
            # suffix is not duplicated by whoever consumes the setting.
            "nebius_api_base": nebius_base or "https://api.studio.nebius.com/v1",
            "openai_api_key": openai_key,
            "use_nebius": bool(nebius_key)
        }
        try:
            system = NebiusBibliographySystem(config)
        except Exception as e:  # UI boundary: show the error instead of crashing
            return f"❌ Error: {str(e)}"
        if nebius_key:
            return "✅ Sistema inicializado con Nebius AI"
        return "✅ Sistema inicializado (sin Nebius)"

    async def process_document(file_obj, use_nebius, progress=gr.Progress()):
        """Run the backend on an uploaded file and build every result view.

        Args:
            file_obj: Value delivered by the ``gr.File`` input; either a
                path-like string or an object exposing the path as ``.name``.
            use_nebius: Checkbox state. It only selects the progress label
                here. NOTE(review): it is not forwarded to the backend.
            progress: Gradio progress tracker.

        Returns:
            ``(zip_path, status, html, text, json, stats)``, always six
            values, matching the outputs wired to the process button.
        """
        nonlocal current_process
        import shutil
        import tempfile

        if not system:
            return _failure("❌ Sistema no inicializado")
        if file_obj is None:
            return _failure("❌ Error: No se ha subido ningún archivo")

        temp_dir = None
        try:
            progress(0, desc="Preparando archivo...")

            # The upload arrives with an absolute path. Joining that path
            # directly onto temp_dir would discard temp_dir and make the
            # copy target the source itself, so only the base name is kept.
            source_path = os.fspath(getattr(file_obj, "name", file_obj))
            temp_dir = tempfile.mkdtemp()
            file_path = os.path.join(temp_dir, os.path.basename(source_path))
            shutil.copy(source_path, file_path)

            progress(0.1, desc="Procesando con Nebius..." if use_nebius else "Procesando...")
            result = await system.process_document(file_path)
            if not result.get("success"):
                return _failure(f"❌ Error: {result.get('error')}")

            current_process = result.get("process_id")
            summary = result.get("summary", {})

            progress(0.9, desc="Generando reportes...")
            report_data = result.get("report", {})
            html_report = _generate_html_report(report_data)
            text_report = _generate_text_report(report_data)
            json_report = json.dumps(report_data, indent=2, default=str)
            stats_report = _generate_stats_display(summary)

            progress(1.0, desc="Completado!")
            return (
                result.get("zip_path"),
                f"✅ Proceso {current_process} completado",
                html_report,
                text_report,
                json_report,
                stats_report
            )
        except Exception as e:  # UI boundary: report instead of crashing
            logger.exception("Processing error: %s", e)
            return _failure(f"❌ Error: {str(e)}")
        finally:
            # Remove the working copy on every exit path.
            if temp_dir is not None:
                shutil.rmtree(temp_dir, ignore_errors=True)

    def _generate_html_report(report_data: Dict) -> str:
        """Render the report dict as an HTML summary card.

        Lists at most the first five downloaded files; file names are
        HTML-escaped because they originate from processed documents.
        """
        from html import escape

        verified = len(report_data.get("verified_resources", []))
        total = report_data.get("total_citations", 0)
        success_rate = (verified / max(1, total)) * 100
        nebius_usage = report_data.get("nebius_usage", {})
        downloaded = report_data.get("downloaded_files", [])
        failed = len(report_data.get("failed_verifications", []))

        markup = f"""
<div style="font-family: Arial, sans-serif; padding: 20px;">
<h2 style="color: #2c3e50;">📊 Reporte de Procesamiento Nebius</h2>
<div style="background: #ecf0f1; padding: 15px; border-radius: 10px; margin: 15px 0;">
<h3 style="color: #34495e;">📈 Resumen General</h3>
<div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 10px;">
<div style="background: white; padding: 10px; border-radius: 5px;">
<strong>Referencias Encontradas</strong><br>
<span style="font-size: 24px; color: #3498db;">{total}</span>
</div>
<div style="background: white; padding: 10px; border-radius: 5px;">
<strong>Verificadas</strong><br>
<span style="font-size: 24px; color: #2ecc71;">{verified}</span>
</div>
<div style="background: white; padding: 10px; border-radius: 5px;">
<strong>Tasa de Éxito</strong><br>
<span style="font-size: 24px; color: #9b59b6;">{success_rate:.1f}%</span>
</div>
<div style="background: white; padding: 10px; border-radius: 5px;">
<strong>Tiempo</strong><br>
<span style="font-size: 24px; color: #e74c3c;">{report_data.get('processing_time', 0):.1f}s</span>
</div>
</div>
</div>
<div style="background: #d5f4e6; padding: 15px; border-radius: 10px; margin: 15px 0;">
<h3 style="color: #27ae60;">🤖 Nebius AI</h3>
<p><strong>Estado:</strong> {'✅ Activado' if nebius_usage.get('enabled') else '❌ Desactivado'}</p>
<p><strong>Llamadas API:</strong> {nebius_usage.get('calls', 0)}</p>
<p><strong>Referencias Mejoradas:</strong> {nebius_usage.get('enhanced_references', 0)}</p>
</div>
<div style="background: #e8f4fc; padding: 15px; border-radius: 10px; margin: 15px 0;">
<h3 style="color: #2980b9;">📥 Descargas</h3>
<p><strong>Archivos Descargados:</strong> {len(downloaded)}</p>
<ul>
"""
        for file in downloaded[:5]:
            markup += f'<li>{escape(os.path.basename(file))}</li>'
        markup += f"""
</ul>
</div>
<div style="background: #fdebd0; padding: 15px; border-radius: 10px; margin: 15px 0;">
<h3 style="color: #d35400;">⚠️ Referencias No Verificadas</h3>
<p><strong>Total:</strong> {failed}</p>
"""
        markup += """
</div>
</div>
"""
        return markup

    def _generate_text_report(report_data: Dict) -> str:
        """Render the report dict as a plain-text summary."""
        verified = len(report_data.get("verified_resources", []))
        total = report_data.get("total_citations", 0)
        nebius_usage = report_data.get("nebius_usage", {})
        text = f"""
REPORTE DE PROCESAMIENTO
========================
Archivo: {report_data.get('input_file', 'Desconocido')}
Fecha: {report_data.get('timestamp', '')}
ESTADÍSTICAS:
-------------
• Referencias encontradas: {total}
• Referencias verificadas: {verified}
• Archivos descargados: {len(report_data.get('downloaded_files', []))}
• Tiempo de procesamiento: {report_data.get('processing_time', 0):.2f}s
• Tasa de éxito: {(verified/max(1, total))*100:.1f}%
NEBIUS AI:
----------
• Estado: {'Activado' if nebius_usage.get('enabled') else 'Desactivado'}
• Llamadas API: {nebius_usage.get('calls', 0)}
• Referencias mejoradas: {nebius_usage.get('enhanced_references', 0)}
Para más detalles, consulte el archivo ZIP con el reporte completo.
"""
        return text

    def _generate_stats_display(summary: Dict) -> str:
        """Render the quick statistics shown next to the status line."""
        return f"""
⚡ PROCESO COMPLETADO ⚡
📊 Estadísticas Rápidas:
• Referencias: {summary.get('references_found', 0)}
• Verificadas: {summary.get('verified', 0)}
• Descargadas: {summary.get('downloaded', 0)}
• Tasa de éxito: {summary.get('success_rate', '0%')}
• Tiempo: {summary.get('processing_time', '0s')}
"""

    def get_system_stats():
        """Return the backend's cumulative statistics as display text."""
        if not system:
            return "❌ Sistema no inicializado"
        stats = system.get_stats()
        return f"""
📈 Estadísticas del Sistema Nebius:
• Documentos procesados: {stats.get('total_processed', 0)}
• Referencias totales: {stats.get('total_references', 0)}
• Llamadas Nebius API: {stats.get('nebius_calls', 0)}
• Tasa de éxito promedio: {stats.get('success_rate', 0) * 100:.1f}%
• Directorio de salida: {stats.get('output_directory', 'N/A')}
"""

    # Build the interface
    with gr.Blocks(title="Nebius Bibliography System", theme=gr.themes.Soft()) as interface:
        gr.Markdown("# 📚 Sistema de Recopilación Bibliográfica con Nebius AI")
        gr.Markdown("Procesa documentos académicos usando Nebius AI para extracción y verificación inteligente")
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### ⚙️ Configuración Nebius AI")
                provider = gr.Dropdown(
                    choices=["nebius", "openai"],
                    label="Proveedor de IA Principal",
                    value="nebius",
                    info="Selecciona Nebius para usar la API de Nebius AI"
                )
                model = gr.Textbox(
                    label="Modelo",
                    value="neural-chat-7b-v3-1",
                    placeholder="Modelo de Nebius (ej: neural-chat-7b-v3-1)"
                )
                nebius_key = gr.Textbox(
                    label="Nebius API Key",
                    type="password",
                    placeholder="Ingresa tu API Key de Nebius"
                )
                nebius_base = gr.Textbox(
                    label="Nebius API Base (opcional)",
                    value="https://api.studio.nebius.com/v1",
                    placeholder="URL base de la API de Nebius"
                )
                openai_key = gr.Textbox(
                    label="OpenAI API Key (respaldo)",
                    type="password",
                    placeholder="Opcional: Key de OpenAI como respaldo"
                )
                init_btn = gr.Button("🚀 Inicializar Sistema Nebius", variant="primary")
                init_status = gr.Markdown("")
                gr.Markdown("---")
                stats_btn = gr.Button("📊 Estadísticas del Sistema")
                system_stats = gr.Markdown("")
            with gr.Column(scale=2):
                gr.Markdown("### 📄 Procesar Documento")
                file_input = gr.File(
                    label="Sube tu documento",
                    file_types=[".txt", ".pdf", ".docx", ".html", ".md"]
                )
                use_nebius = gr.Checkbox(
                    label="Usar Nebius AI para mejora de precisión",
                    value=True
                )
                process_btn = gr.Button("🔍 Procesar con Nebius AI", variant="primary")
                gr.Markdown("### 📦 Resultados")
                result_file = gr.File(label="Descargar Paquete Completo (ZIP)")
                result_status = gr.Markdown("")
                stats_display = gr.Markdown("")
                with gr.Tabs():
                    with gr.TabItem("📋 Vista HTML"):
                        html_output = gr.HTML(label="Reporte Interactivo")
                    with gr.TabItem("📝 Texto Plano"):
                        text_output = gr.Textbox(
                            label="Resumen",
                            lines=15,
                            max_lines=30
                        )
                    with gr.TabItem("🔧 JSON Completo"):
                        json_output = gr.Code(
                            label="Datos Completos",
                            language="json",
                            lines=20
                        )
        # Wire up events
        init_btn.click(
            initialize_system,
            inputs=[provider, model, nebius_key, nebius_base, openai_key],
            outputs=init_status
        )
        process_btn.click(
            process_document,
            inputs=[file_input, use_nebius],
            outputs=[result_file, result_status, html_output, text_output, json_output, stats_display]
        )
        stats_btn.click(
            get_system_stats,
            outputs=system_stats
        )
        # Informational footer
        gr.Markdown("""
### 📌 Características Nebius AI
**🔍 Extracción Inteligente:**
- Identificación contextual de referencias
- Corrección automática de identificadores
- Clasificación por tipo de recurso
**✅ Verificación Avanzada:**
- Análisis de accesibilidad
- Detección de acceso abierto
- Sugerencias de fuentes alternativas
**📊 Reportes Mejorados:**
- Métricas de confianza Nebius
- Análisis de calidad por referencia
- Estadísticas de uso de IA
### ⚠️ Notas Importantes
1. La API de Nebius requiere una key válida
2. Los archivos grandes pueden consumir más tokens
3. Se recomienda usar Nebius para máxima precisión
4. Mantén tu API key segura y no la compartas
### 🔗 Recursos
• [Documentación Nebius AI](https://docs.nebius.com)
• [Obtener API Key](https://studio.nebius.com)
• [Soporte Técnico](https://support.nebius.com)
""")
    return interface
| # ========== EJECUCIÓN PRINCIPAL ========== | |
async def main():
    """Program entry point: launch the Gradio UI or process one file.

    Parses ``sys.argv``. In ``gui`` mode (the default) it builds the
    interface and launches it. In ``cli`` mode it validates ``--file``,
    builds a ``NebiusBibliographySystem`` from the arguments, processes
    the file and prints a result summary. Problems in CLI mode are
    printed and the coroutine returns without raising.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Sistema Nebius de Recopilación Bibliográfica")
    cli.add_argument("--mode", choices=["gui", "cli"], default="gui",
                     help="Modo de ejecución")
    cli.add_argument("--file", type=str, help="Archivo a procesar (modo CLI)")
    cli.add_argument("--nebius-key", help="API Key de Nebius")
    cli.add_argument("--model", default="neural-chat-7b-v3-1", help="Modelo Nebius")
    cli.add_argument("--api-base", default="https://api.studio.nebius.com/v1",
                     help="URL base de Nebius API")
    args = cli.parse_args()

    if args.mode == "gui":
        # Start the Gradio interface.
        launch_options = dict(
            server_name="0.0.0.0",
            server_port=7860,
            share=True,
            debug=True,
        )
        create_nebius_interface().launch(**launch_options)
        return

    if args.mode != "cli":
        return

    # Command-line mode: validate the input file first.
    target = args.file
    if not target:
        print("❌ Error: Debes especificar un archivo con --file")
        return
    if not os.path.exists(target):
        print(f"❌ Error: Archivo no encontrado: {target}")
        return

    # Nebius is enabled only when a non-empty key was supplied.
    use_nebius = bool(args.nebius_key)
    nebius_key = args.nebius_key if use_nebius else None
    if not use_nebius:
        print("⚠️ Advertencia: No se proporcionó API Key de Nebius")

    # Configure the system.
    system = NebiusBibliographySystem({
        "llm_provider": "nebius" if use_nebius else "openai",
        "llm_model": args.model,
        "nebius_api_key": nebius_key,
        "nebius_api_base": args.api_base,
        "use_nebius": use_nebius,
    })

    print(f"🔍 Procesando archivo: {target}")
    print(f"🤖 Nebius AI: {'Activado' if use_nebius else 'Desactivado'}")
    print("⏳ Procesando...")
    result = await system.process_document(target)

    if not result.get("success"):
        print(f"❌ Error: {result.get('error')}")
        return

    print("✅ Procesamiento completado!")
    print(f"📊 ID del proceso: {result.get('process_id')}")
    summary = result.get("summary", {})
    nebius_usage = result.get("report", {}).get("nebius_usage", {})
    print(f"""
📈 Resultados:
- Referencias encontradas: {summary.get('references_found', 0)}
- Referencias verificadas: {summary.get('verified', 0)}
- Archivos descargados: {summary.get('downloaded', 0)}
- Tasa de éxito: {summary.get('success_rate', '0%')}
- Tiempo de procesamiento: {summary.get('processing_time', '0s')}
📦 Paquete de resultados: {result.get('zip_path')}
📊 Estadísticas Nebius:
- Llamadas API: {nebius_usage.get('calls', 0)}
- Referencias mejoradas: {nebius_usage.get('enhanced_references', 0)}
""")
if __name__ == "__main__":
    # Script entry point: run the async main() coroutine to completion
    # on a fresh event loop.
    import asyncio

    asyncio.run(main())