diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -13,6 +13,7 @@ import gradio as gr from enum import Enum import hashlib import urllib.parse +import aiohttp # Importar smolagents from smolagents import CodeAgent, ToolCallingAgent, LiteLLMModel @@ -24,12 +25,130 @@ logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', handlers=[ - logging.FileHandler('bibliography_system.log'), + logging.FileHandler('bibliography_nebius.log'), logging.StreamHandler() ] ) logger = logging.getLogger(__name__) +# ========== CONFIGURACIÓN NEBIUS API ========== + +class NebiusAPI: + """Cliente para API de Nebius AI""" + + def __init__(self, api_key: str, base_url: str = "https://api.studio.nebius.com"): + self.api_key = api_key + self.base_url = base_url + self.headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json" + } + + async def generate_text(self, prompt: str, model: str = "neural-chat-7b-v3-1", + max_tokens: int = 1000, temperature: float = 0.7) -> str: + """Generar texto usando modelos de Nebius""" + url = f"{self.base_url}/v1/chat/completions" + + payload = { + "model": model, + "messages": [ + {"role": "user", "content": prompt} + ], + "max_tokens": max_tokens, + "temperature": temperature, + "top_p": 0.95 + } + + try: + async with aiohttp.ClientSession() as session: + async with session.post( + url, + headers=self.headers, + json=payload, + timeout=30 + ) as response: + if response.status == 200: + data = await response.json() + return data.get("choices", [{}])[0].get("message", {}).get("content", "") + else: + error_text = await response.text() + logger.error(f"Nebius API error {response.status}: {error_text}") + return "" + except Exception as e: + logger.error(f"Error calling Nebius API: {e}") + return "" + + async def extract_references(self, text: str) -> List[Dict[str, Any]]: + """Usar Nebius para extraer referencias de texto""" + prompt = f"""Analiza el siguiente texto y extrae todas las referencias bibliográficas. + Identifica DOIs, ISBNs, URLs académicas, arXiv IDs y otras referencias académicas. + + Texto: + {text[:5000]} # Limitar tamaño + + Devuelve un JSON con el siguiente formato: + {{ + "references": [ + {{ + "type": "doi|isbn|arxiv|url|pmid|other", + "identifier": "identificador_completo", + "raw_text": "texto_original_encontrado", + "confidence": 0.0-1.0, + "context": "texto_alrededor_del_identificador" + }} + ] + }} + + Solo devuelve el JSON, sin texto adicional.""" + + response = await self.generate_text(prompt, max_tokens=2000) + + try: + # Buscar JSON en la respuesta + json_match = re.search(r'\{.*\}', response, re.DOTALL) + if json_match: + data = json.loads(json_match.group()) + return data.get("references", []) + except Exception as e: + logger.error(f"Error parsing Nebius response: {e}") + + return [] + + async def verify_reference(self, reference: Dict[str, Any]) -> Dict[str, Any]: + """Verificar una referencia usando Nebius""" + prompt = f"""Verifica la siguiente referencia académica y proporciona información sobre su accesibilidad: + + Tipo: {reference.get('type')} + Identificador: {reference.get('identifier')} + Contexto: {reference.get('context', 'No disponible')} + + Analiza: + 1. ¿Es un identificador válido? + 2. ¿Dónde podría encontrarse este recurso? + 3. ¿Es probable que esté disponible en acceso abierto? + 4. Proporciona posibles URLs para acceder al recurso. + + Devuelve un JSON con el siguiente formato: + {{ + "valid": true/false, + "confidence": 0.0-1.0, + "sources": ["lista", "de", "posibles", "fuentes"], + "likely_open_access": true/false, + "suggested_urls": ["url1", "url2"], + "notes": "notas_adicionales" + }}""" + + response = await self.generate_text(prompt, max_tokens=1000) + + try: + json_match = re.search(r'\{.*\}', response, re.DOTALL) + if json_match: + return json.loads(json_match.group()) + except Exception as e: + logger.error(f"Error parsing verification response: {e}") + + return {"valid": False, "confidence": 0.0, "sources": [], "notes": "Error en verificación"} + # ========== MODELOS DE DATOS ========== class ResourceType(str, Enum): @@ -51,6 +170,8 @@ class CitationModel(BaseModel): confidence: float = 0.0 extracted_from: str position: Tuple[int, int] = (0, 0) + nebius_verified: bool = False + nebius_confidence: float = 0.0 class VerificationResult(BaseModel): citation: CitationModel @@ -61,6 +182,7 @@ class VerificationResult(BaseModel): file_size: Optional[int] quality_score: float notes: List[str] = Field(default_factory=list) + nebius_analysis: Optional[Dict[str, Any]] = None class ProcessingReport(BaseModel): input_file: str @@ -71,172 +193,169 @@ class ProcessingReport(BaseModel): processing_time: float summary: Dict[str, Any] = Field(default_factory=dict) timestamp: str = Field(default_factory=lambda: datetime.now().isoformat()) + nebius_usage: Dict[str, Any] = Field(default_factory=dict) -# ========== HERRAMIENTAS PARA AGENTES ========== +# ========== HERRAMIENTAS CON INTEGRACIÓN NEBIUS ========== -class BibliographyExtractionTool(Tool): - name = "extract_bibliography" +class NebiusEnhancedExtractionTool(Tool): + name = "nebius_extract_references" description = """ - Extract bibliographic references from text. Identifies DOIs, ISBNs, arXiv IDs, URLs, - and other academic identifiers from unstructured text. + Extract bibliographic references using Nebius AI for enhanced accuracy. Args: - text (str): The text to analyze - source_name (str): Name of the source document + text (str): Text to analyze + nebius_api_key (str): Nebius API key + use_ai_enhancement (bool): Whether to use Nebius AI for enhancement Returns: - List[CitationModel]: List of extracted citations + List[Dict]: Extracted references with Nebius AI analysis """ def __init__(self): super().__init__() - # Patrones para diferentes tipos de recursos + # Patrones básicos para extracción inicial self.patterns = { ResourceType.DOI: [ r'\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b', r'doi:\s*(10\.\d{4,9}/[-._;()/:A-Z0-9]+)', - r'DOI:\s*(10\.\d{4,9}/[-._;()/:A-Z0-9]+)' ], ResourceType.ISBN: [ - r'ISBN(?:-1[03])?:?\s*(?=[0-9X]{10}|(?=(?:[0-9]+[- ]){3})[- 0-9X]{13}|97[89][0-9]{10}|(?=(?:[0-9]+[- ]){4})[- 0-9]{17})(?:97[89][- ]?)?[0-9]{1,5}[- ]?[0-9]+[- ]?[0-9]+[- ]?[0-9X]' + r'ISBN(?:-1[03])?:?\s*(?=[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?[0-9]+[- ]?[0-9]+[- ]?[0-9X]', ], ResourceType.ARXIV: [ r'arXiv:\s*(\d{4}\.\d{4,5}(v\d+)?)', r'arxiv:\s*([a-z\-]+/\d{7})' ], - ResourceType.PMID: [ - r'PMID:\s*(\d+)', - r'PubMed ID:\s*(\d+)' - ] } - def forward(self, text: str, source_name: str = "unknown") -> List[Dict[str, Any]]: - """Extract citations from text""" - citations = [] - text_lower = text.lower() + def forward(self, text: str, nebius_api_key: str = None, + use_ai_enhancement: bool = False) -> List[Dict[str, Any]]: + """Extraer referencias con opción de mejora con Nebius""" + # Extracción básica + basic_references = self._extract_basic(text) + + if not use_ai_enhancement or not nebius_api_key: + return basic_references + + # Mejora con Nebius AI + try: + nebius = NebiusAPI(nebius_api_key) + + # Usar asyncio en contexto síncrono + import nest_asyncio + nest_asyncio.apply() + + # Extraer con Nebius + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + nebius_references = loop.run_until_complete( + nebius.extract_references(text[:10000]) # Limitar para API + ) + loop.close() + + # Combinar resultados + enhanced_references = self._merge_references(basic_references, nebius_references) + return enhanced_references + + except Exception as e: + logger.error(f"Error using Nebius enhancement: {e}") + return basic_references + + def _extract_basic(self, text: str) -> List[Dict[str, Any]]: + """Extracción básica de referencias""" + references = [] - # Buscar por tipo de recurso for resource_type, patterns in self.patterns.items(): for pattern in patterns: matches = re.finditer(pattern, text, re.IGNORECASE) for match in matches: identifier = match.group(1) if match.groups() else match.group(0) - - # Limpiar identificador identifier = self._clean_identifier(identifier, resource_type) if identifier: - # Calcular confianza basada en el contexto - confidence = self._calculate_confidence( - identifier, resource_type, text_lower, match.start() - ) - - citation = CitationModel( - id=hashlib.md5( - f"{identifier}_{source_name}".encode() - ).hexdigest()[:12], - raw_text=match.group(0), - resource_type=resource_type, - identifier=identifier, - metadata={ - "found_at": match.start(), - "context": self._get_context(text, match.start(), match.end()) - }, - confidence=confidence, - extracted_from=source_name, - position=(match.start(), match.end()) - ) - citations.append(citation.dict()) - - # Extraer URLs generales (solo si parecen académicas) - url_pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+' - url_matches = re.finditer(url_pattern, text) - - for match in url_matches: - url = match.group(0) - if self._is_academic_url(url): - citation = CitationModel( - id=hashlib.md5(f"{url}_{source_name}".encode()).hexdigest()[:12], - raw_text=url, - resource_type=ResourceType.URL, - identifier=url, - metadata={ - "found_at": match.start(), - "context": self._get_context(text, match.start(), match.end()) - }, - confidence=0.6, - extracted_from=source_name, - position=(match.start(), match.end()) - ) - citations.append(citation.dict()) + reference = { + "id": hashlib.md5(identifier.encode()).hexdigest()[:12], + "raw_text": match.group(0), + "type": resource_type.value, + "identifier": identifier, + "confidence": 0.8, + "context": self._get_context(text, match.start(), match.end()), + "position": (match.start(), match.end()), + "extraction_method": "regex" + } + references.append(reference) + + return references + + def _merge_references(self, basic: List[Dict], nebius: List[Dict]) -> List[Dict]: + """Combinar referencias de extracción básica y Nebius""" + merged = basic.copy() + + for nebius_ref in nebius: + # Verificar si ya existe + exists = False + for ref in merged: + if ref.get('identifier') == nebius_ref.get('identifier'): + exists = True + # Actualizar confianza y metadata + ref['confidence'] = max(ref.get('confidence', 0), + nebius_ref.get('confidence', 0)) + ref['extraction_method'] = 'regex+nebius' + break + + if not exists: + # Convertir formato Nebius a nuestro formato + new_ref = { + "id": hashlib.md5( + nebius_ref.get('identifier', '').encode() + ).hexdigest()[:12], + "raw_text": nebius_ref.get('raw_text', ''), + "type": nebius_ref.get('type', 'unknown'), + "identifier": nebius_ref.get('identifier', ''), + "confidence": nebius_ref.get('confidence', 0.7), + "context": nebius_ref.get('context', ''), + "position": (0, 0), + "extraction_method": 'nebius' + } + merged.append(new_ref) - return citations + return merged def _clean_identifier(self, identifier: str, resource_type: ResourceType) -> str: - """Clean identifier""" + """Limpiar identificador""" identifier = identifier.strip() - # Eliminar prefijos prefixes = ['doi:', 'DOI:', 'arxiv:', 'arXiv:', 'isbn:', 'ISBN:', 'pmid:', 'PMID:'] for prefix in prefixes: if identifier.startswith(prefix): identifier = identifier[len(prefix):].strip() - # Limpiar caracteres no deseados identifier = identifier.strip('"\'<>()[]{}') - return identifier - - def _calculate_confidence(self, identifier: str, resource_type: ResourceType, - text: str, position: int) -> float: - """Calculate confidence score for extracted citation""" - confidence = 0.7 # Base confidence - - # Verificar formato DOI - if resource_type == ResourceType.DOI: - if re.match(r'^10\.\d{4,9}/.+', identifier): - confidence += 0.2 - - # Verificar contexto - context_words = ['paper', 'article', 'journal', 'conference', 'published', - 'reference', 'bibliography', 'cite', 'doi', 'url'] - - context = text[max(0, position-100):min(len(text), position+100)] - for word in context_words: - if word in context.lower(): - confidence += 0.05 - - return min(confidence, 1.0) - - def _is_academic_url(self, url: str) -> bool: - """Check if URL looks academic""" - academic_domains = [ - 'arxiv.org', 'doi.org', 'springer.com', 'ieee.org', 'acm.org', - 'sciencedirect.com', 'wiley.com', 'tandfonline.com', 'nature.com', - 'science.org', 'pnas.org', 'plos.org', 'bmc.com', 'frontiersin.org', - 'mdpi.com', 'researchgate.net', 'semanticscholar.org' - ] + if resource_type == ResourceType.URL: + if not identifier.startswith(('http://', 'https://')): + identifier = f'https://{identifier}' - url_lower = url.lower() - return any(domain in url_lower for domain in academic_domains) + return identifier - def _get_context(self, text: str, start: int, end: int, window: int = 50) -> str: - """Get context around match""" + def _get_context(self, text: str, start: int, end: int, window: int = 100) -> str: + """Obtener contexto alrededor del match""" context_start = max(0, start - window) context_end = min(len(text), end + window) return text[context_start:context_end] -class ResourceVerificationTool(Tool): - name = "verify_resource" +class NebiusVerificationTool(Tool): + name = "nebius_verify_reference" description = """ - Verify the existence and accessibility of academic resources. + Verify academic references using Nebius AI analysis. Args: - citation (Dict[str, Any]): Citation to verify - timeout (int): Timeout in seconds + reference (Dict): Reference to verify + nebius_api_key (str): Nebius API key + deep_verify (bool): Whether to perform deep verification Returns: - VerificationResult: Verification result with metadata + Dict: Verification results with Nebius analysis """ def __init__(self): @@ -245,74 +364,88 @@ class ResourceVerificationTool(Tool): 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' } - def forward(self, citation: Dict[str, Any], timeout: int = 10) -> Dict[str, Any]: - """Verify a citation""" - citation_obj = CitationModel(**citation) - - # Preparar resultado + def forward(self, reference: Dict[str, Any], nebius_api_key: str = None, + deep_verify: bool = False) -> Dict[str, Any]: + """Verificar referencia con Nebius""" result = { - "citation": citation_obj.dict(), + "reference": reference, "verified": False, - "verification_source": "none", + "verification_source": "direct", "download_url": None, "file_format": None, "file_size": None, "quality_score": 0.0, - "notes": [] + "notes": [], + "nebius_analysis": None } - try: - if citation_obj.resource_type == ResourceType.DOI: - return self._verify_doi(citation_obj, timeout) - elif citation_obj.resource_type == ResourceType.ARXIV: - return self._verify_arxiv(citation_obj, timeout) - elif citation_obj.resource_type == ResourceType.URL: - return self._verify_url(citation_obj, timeout) - elif citation_obj.resource_type == ResourceType.ISBN: - return self._verify_isbn(citation_obj, timeout) - elif citation_obj.resource_type == ResourceType.PMID: - return self._verify_pmid(citation_obj, timeout) - else: - result["notes"].append(f"Unsupported resource type: {citation_obj.resource_type}") + # Verificación directa primero + direct_result = self._direct_verification(reference) + if direct_result.get("verified"): + result.update(direct_result) + result["quality_score"] = 0.9 - except Exception as e: - result["notes"].append(f"Verification error: {str(e)}") + # Verificación con Nebius si está disponible + if nebius_api_key and deep_verify: + nebius_result = self._nebius_verification(reference, nebius_api_key) + result["nebius_analysis"] = nebius_result + + if nebius_result.get("valid", False): + result["verified"] = True + result["verification_source"] = "nebius" + result["quality_score"] = max( + result.get("quality_score", 0), + nebius_result.get("confidence", 0) + ) + + # Agregar URLs sugeridas por Nebius + suggested_urls = nebius_result.get("suggested_urls", []) + if suggested_urls and not result.get("download_url"): + result["download_url"] = suggested_urls[0] + + result["notes"].append( + f"Nebius analysis: {nebius_result.get('notes', 'No notes')}" + ) return result - def _verify_doi(self, citation: CitationModel, timeout: int) -> Dict[str, Any]: - """Verify DOI""" + def _direct_verification(self, reference: Dict[str, Any]) -> Dict[str, Any]: + """Verificación directa de la referencia""" import requests - result = { - "citation": citation.dict(), - "verified": False, - "verification_source": "crossref", - "download_url": None, - "file_format": None, - "file_size": None, - "quality_score": 0.0, - "notes": [] - } + ref_type = reference.get("type", "") + identifier = reference.get("identifier", "") try: - # Try Crossref API - url = f"https://api.crossref.org/works/{citation.identifier}" - response = requests.get(url, headers=self.headers, timeout=timeout) + if ref_type == "doi": + return self._verify_doi(identifier) + elif ref_type == "arxiv": + return self._verify_arxiv(identifier) + elif ref_type == "url": + return self._verify_url(identifier) + elif ref_type == "isbn": + return self._verify_isbn(identifier) + except Exception as e: + logger.error(f"Direct verification error: {e}") + + return {"verified": False, "notes": [f"Direct verification failed for {ref_type}"]} + + def _verify_doi(self, doi: str) -> Dict[str, Any]: + """Verificar DOI""" + import requests + + try: + # Crossref + url = f"https://api.crossref.org/works/{doi}" + response = requests.get(url, headers=self.headers, timeout=10) if response.status_code == 200: data = response.json() work = data.get('message', {}) - result["verified"] = True - result["quality_score"] = 0.9 - - # Check for open access - if work.get('license'): - result["notes"].append("Open access available") - result["quality_score"] += 0.1 + result = {"verified": True, "notes": ["Verified via Crossref"]} - # Try to find PDF URL + # Buscar PDF links = work.get('link', []) for link in links: if link.get('content-type') == 'application/pdf': @@ -320,1061 +453,878 @@ class ResourceVerificationTool(Tool): result["file_format"] = "pdf" break - # Try Unpaywall - if not result["download_url"]: - unpaywall_url = f"https://api.unpaywall.org/v2/{citation.identifier}?email=user@example.com" - unpaywall_response = requests.get(unpaywall_url, timeout=timeout) - if unpaywall_response.status_code == 200: - unpaywall_data = unpaywall_response.json() - if unpaywall_data.get('is_oa'): - result["download_url"] = unpaywall_data.get('best_oa_location', {}).get('url') - result["verification_source"] = "unpaywall" - - else: - result["notes"].append(f"Crossref API returned {response.status_code}") - + return result except Exception as e: - result["notes"].append(f"DOI verification error: {str(e)}") + logger.error(f"DOI verification error: {e}") - return result + return {"verified": False} - def _verify_arxiv(self, citation: CitationModel, timeout: int) -> Dict[str, Any]: - """Verify arXiv ID""" + def _verify_arxiv(self, arxiv_id: str) -> Dict[str, Any]: + """Verificar arXiv ID""" import requests - result = { - "citation": citation.dict(), - "verified": False, - "verification_source": "arxiv", - "download_url": None, - "file_format": None, - "file_size": None, - "quality_score": 0.0, - "notes": [] - } - try: - # Clean arXiv ID - arxiv_id = citation.identifier + # Limpiar ID if 'arxiv:' in arxiv_id.lower(): arxiv_id = arxiv_id.split(':')[-1].strip() - # Check arXiv API + # Verificar existencia api_url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}" - response = requests.get(api_url, headers=self.headers, timeout=timeout) + response = requests.get(api_url, headers=self.headers, timeout=10) if response.status_code == 200: - result["verified"] = True - result["quality_score"] = 0.95 - result["download_url"] = f"https://arxiv.org/pdf/{arxiv_id}.pdf" - result["file_format"] = "pdf" - result["notes"].append("arXiv paper available") - + return { + "verified": True, + "download_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf", + "file_format": "pdf", + "notes": ["arXiv paper available"] + } except Exception as e: - result["notes"].append(f"arXiv verification error: {str(e)}") + logger.error(f"arXiv verification error: {e}") - return result + return {"verified": False} - def _verify_url(self, citation: CitationModel, timeout: int) -> Dict[str, Any]: - """Verify URL""" + def _verify_url(self, url: str) -> Dict[str, Any]: + """Verificar URL""" import requests - result = { - "citation": citation.dict(), - "verified": False, - "verification_source": "direct", - "download_url": None, - "file_format": None, - "file_size": None, - "quality_score": 0.0, - "notes": [] - } - try: - response = requests.head( - citation.identifier, - headers=self.headers, - timeout=timeout, - allow_redirects=True - ) + response = requests.head(url, headers=self.headers, timeout=10, allow_redirects=True) if response.status_code == 200: - content_type = response.headers.get('content-type', '') - - result["verified"] = True - result["quality_score"] = 0.7 - result["download_url"] = citation.identifier + result = {"verified": True, "notes": [f"URL accessible: {response.status_code}"]} - # Check if it's a PDF + # Verificar si es PDF + content_type = response.headers.get('content-type', '') if 'application/pdf' in content_type: + result["download_url"] = url result["file_format"] = "pdf" - result["quality_score"] += 0.2 - - # Try to get file size - content_length = response.headers.get('content-length') - if content_length: - result["file_size"] = int(content_length) - result["notes"].append(f"Content-Type: {content_type}") - + return result except Exception as e: - result["notes"].append(f"URL verification error: {str(e)}") + logger.error(f"URL verification error: {e}") - return result + return {"verified": False} - def _verify_isbn(self, citation: CitationModel, timeout: int) -> Dict[str, Any]: - """Verify ISBN""" + def _verify_isbn(self, isbn: str) -> Dict[str, Any]: + """Verificar ISBN""" import requests - result = { - "citation": citation.dict(), - "verified": False, - "verification_source": "openlibrary", - "download_url": None, - "file_format": None, - "file_size": None, - "quality_score": 0.0, - "notes": [] - } - try: - # Try Open Library API - url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{citation.identifier}&format=json" - response = requests.get(url, headers=self.headers, timeout=timeout) + # Open Library + url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&format=json" + response = requests.get(url, headers=self.headers, timeout=10) if response.status_code == 200: data = response.json() if data: - result["verified"] = True - result["quality_score"] = 0.8 - result["notes"].append("ISBN found in Open Library") - + return { + "verified": True, + "notes": ["ISBN found in Open Library"] + } except Exception as e: - result["notes"].append(f"ISBN verification error: {str(e)}") + logger.error(f"ISBN verification error: {e}") - return result + return {"verified": False} - def _verify_pmid(self, citation: CitationModel, timeout: int) -> Dict[str, Any]: - """Verify PMID""" - import requests - - result = { - "citation": citation.dict(), - "verified": False, - "verification_source": "pubmed", - "download_url": None, - "file_format": None, - "file_size": None, - "quality_score": 0.0, - "notes": [] - } - + def _nebius_verification(self, reference: Dict[str, Any], api_key: str) -> Dict[str, Any]: + """Verificación con Nebius AI""" try: - # Try PubMed API - url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={citation.identifier}&retmode=json" - response = requests.get(url, headers=self.headers, timeout=timeout) + nebius = NebiusAPI(api_key) - if response.status_code == 200: - data = response.json() - if data.get('result', {}).get(citation.identifier): - result["verified"] = True - result["quality_score"] = 0.85 - result["notes"].append("PMID found in PubMed") - - except Exception as e: - result["notes"].append(f"PMID verification error: {str(e)}") - - return result - -class PaperDownloadTool(Tool): - name = "download_paper" - description = """ - Download academic paper from verified source. - - Args: - verification_result (Dict[str, Any]): Verified resource to download - output_dir (str): Directory to save downloaded file - - Returns: - Dict[str, Any]: Download result with file path and metadata - """ - - def __init__(self): - super().__init__() - self.headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' - } - - def forward(self, verification_result: Dict[str, Any], - output_dir: str = "downloads") -> Dict[str, Any]: - """Download paper""" - import requests - import os - - result = { - "success": False, - "file_path": None, - "file_size": 0, - "download_time": 0, - "error": None, - "metadata": verification_result - } - - try: - # Create output directory - os.makedirs(output_dir, exist_ok=True) - - download_url = verification_result.get("download_url") - if not download_url: - result["error"] = "No download URL available" - return result + # Usar asyncio en contexto síncrono + import nest_asyncio + nest_asyncio.apply() - # Generate filename - citation = verification_result.get("citation", {}) - identifier = citation.get("identifier", "unknown") - file_ext = verification_result.get("file_format", "pdf") - - # Clean filename - filename = re.sub(r'[^\w\-\.]', '_', identifier) - if not filename.endswith(f'.{file_ext}'): - filename = f"{filename}.{file_ext}" - - file_path = os.path.join(output_dir, filename) - - # Download file - start_time = datetime.now() - response = requests.get( - download_url, - headers=self.headers, - stream=True, - timeout=30 + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + analysis = loop.run_until_complete( + nebius.verify_reference(reference) ) + loop.close() - if response.status_code == 200: - with open(file_path, 'wb') as f: - for chunk in response.iter_content(chunk_size=8192): - if chunk: - f.write(chunk) - - download_time = (datetime.now() - start_time).total_seconds() - file_size = os.path.getsize(file_path) - - result["success"] = True - result["file_path"] = file_path - result["file_size"] = file_size - result["download_time"] = download_time - - logger.info(f"Downloaded {filename} ({file_size} bytes)") - else: - result["error"] = f"HTTP {response.status_code}" - - except Exception as e: - result["error"] = str(e) - logger.error(f"Download error: {e}") - - return result - -class FileProcessingTool(Tool): - name = "process_file" - description = """ - Process different file types to extract text for bibliography extraction. - - Args: - file_path (str): Path to the file - file_type (str): Type of file (auto-detected if None) - - Returns: - Dict[str, Any]: Extracted text and metadata - """ - - def __init__(self): - super().__init__() - - def forward(self, file_path: str, file_type: str = None) -> Dict[str, Any]: - """Process file and extract text""" - import os - - result = { - "success": False, - "text": "", - "file_type": file_type, - "file_size": 0, - "error": None, - "metadata": {} - } - - try: - if not os.path.exists(file_path): - result["error"] = "File not found" - return result - - file_size = os.path.getsize(file_path) - result["file_size"] = file_size - - # Determine file type - if not file_type: - file_type = self._detect_file_type(file_path) - - result["file_type"] = file_type - - # Process based on file type - if file_type == "txt": - with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: - result["text"] = f.read() - result["success"] = True + return analysis - elif file_type == "pdf": - result["text"] = self._extract_from_pdf(file_path) - result["success"] = True - - elif file_type == "docx": - result["text"] = self._extract_from_docx(file_path) - result["success"] = True - - elif file_type == "html": - with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: - html_content = f.read() - result["text"] = self._extract_from_html(html_content) - result["success"] = True - - else: - # Try as text file - try: - with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: - result["text"] = f.read() - result["success"] = True - except: - result["error"] = f"Unsupported file type: {file_type}" - except Exception as e: - result["error"] = str(e) - - return result - - def _detect_file_type(self, file_path: str) -> str: - """Detect file type from extension""" - ext = os.path.splitext(file_path)[1].lower() - - type_mapping = { - '.txt': 'txt', - '.pdf': 'pdf', - '.docx': 'docx', - '.doc': 'doc', - '.html': 'html', - '.htm': 'html', - '.md': 'markdown', - '.rtf': 'rtf' - } - - return type_mapping.get(ext, 'unknown') - - def _extract_from_pdf(self, file_path: str) -> str: - """Extract text from PDF""" - try: - # Try PyPDF2 - import PyPDF2 - text = "" - with open(file_path, 'rb') as file: - pdf_reader = PyPDF2.PdfReader(file) - for page in pdf_reader.pages: - text += page.extract_text() - return text - except ImportError: - logger.warning("PyPDF2 not installed, using fallback") - # Fallback: use pdftotext command if available - import subprocess - try: - result = subprocess.run( - ['pdftotext', file_path, '-'], - capture_output=True, - text=True - ) - if result.returncode == 0: - return result.stdout - except: - pass - return "" - - def _extract_from_docx(self, file_path: str) -> str: - """Extract text from DOCX""" - try: - from docx import Document - doc = Document(file_path) - return "\n".join([paragraph.text for paragraph in doc.paragraphs]) - except ImportError: - logger.warning("python-docx not installed") - return "" - except Exception as e: - logger.error(f"Error reading DOCX: {e}") - return "" - - def _extract_from_html(self, html_content: str) -> str: - """Extract text from HTML""" - try: - from bs4 import BeautifulSoup - soup = BeautifulSoup(html_content, 'html.parser') - # Remove script and style elements - for script in soup(["script", "style"]): - script.decompose() - return soup.get_text() - except ImportError: - # Simple regex-based extraction - import re - text = re.sub(r'<[^>]+>', ' ', html_content) - text = re.sub(r'\s+', ' ', text) - return text + logger.error(f"Nebius verification error: {e}") + return {"valid": False, "confidence": 0.0, "notes": f"Error: {str(e)}"} -# ========== AGENTES PRINCIPALES ========== +# ========== SISTEMA PRINCIPAL CON NEBIUS ========== -class BibliographyProcessingSystem: - """Sistema principal de procesamiento bibliográfico usando smolagents""" +class NebiusBibliographySystem: + """Sistema de procesamiento bibliográfico con Nebius AI""" - def __init__(self, model_config: Dict[str, Any] = None): - self.model_config = model_config or { - "model_id": "gpt-4", - "api_key": os.getenv("OPENAI_API_KEY", ""), - "provider": "openai" - } + def __init__(self, config: Dict[str, Any]): + self.config = config + self.nebius_api_key = config.get("nebius_api_key") + self.use_nebius = bool(self.nebius_api_key) # Inicializar herramientas - self.extraction_tool = BibliographyExtractionTool() - self.verification_tool = ResourceVerificationTool() - self.download_tool = PaperDownloadTool() - self.file_tool = FileProcessingTool() + self.extraction_tool = NebiusEnhancedExtractionTool() + self.verification_tool = NebiusVerificationTool() - # Crear agentes - self.extraction_agent = self._create_extraction_agent() - self.verification_agent = self._create_verification_agent() - self.download_agent = self._create_download_agent() + # Configurar modelo LiteLLM para agentes + self.llm_model = self._configure_llm() - # Directorios - self.output_dir = "bibliography_output" - self.download_dir = os.path.join(self.output_dir, "downloads") - self.report_dir = os.path.join(self.output_dir, "reports") + # Directorios de salida + self.output_base = "nebius_bibliography" + self.download_dir = os.path.join(self.output_base, "downloads") + self.report_dir = os.path.join(self.output_base, "reports") + self.log_dir = os.path.join(self.output_base, "logs") # Crear directorios - os.makedirs(self.output_dir, exist_ok=True) - os.makedirs(self.download_dir, exist_ok=True) - os.makedirs(self.report_dir, exist_ok=True) - - # Estado - self.current_process_id = None - self.processing_results = {} - - def _create_extraction_agent(self) -> ToolCallingAgent: - """Crear agente de extracción""" - model = self._create_model() - - agent = ToolCallingAgent( - tools=[self.extraction_tool, self.file_tool], - model=model, - name="ExtractionAgent", - description="Extract bibliographic references from documents", - max_steps=10 - ) - - return agent - - def _create_verification_agent(self) -> ToolCallingAgent: - """Crear agente de verificación""" - model = self._create_model() - - agent = ToolCallingAgent( - tools=[self.verification_tool], - model=model, - name="VerificationAgent", - description="Verify the existence and accessibility of academic resources", - max_steps=15 - ) - - return agent - - def _create_download_agent(self) -> ToolCallingAgent: - """Crear agente de descarga""" - model = self._create_model() - - agent = ToolCallingAgent( - tools=[self.download_tool], - model=model, - name="DownloadAgent", - description="Download academic papers from verified sources", - max_steps=20 - ) + for dir_path in [self.output_base, self.download_dir, self.report_dir, self.log_dir]: + os.makedirs(dir_path, exist_ok=True) + + # Estadísticas + self.stats = { + "total_processed": 0, + "total_references": 0, + "nebius_calls": 0, + "success_rate": 0.0 + } - return agent + logger.info(f"Nebius system initialized. Nebius AI: {'Enabled' if self.use_nebius else 'Disabled'}") - def _create_model(self): - """Crear modelo según configuración""" - provider = self.model_config.get("provider", "openai") + def _configure_llm(self): + """Configurar modelo LiteLLM""" + provider = self.config.get("llm_provider", "openai") - if provider == "openai": + if provider == "nebius" and self.nebius_api_key: + # Configurar Nebius como proveedor personalizado return LiteLLMModel( - model_id=self.model_config.get("model_id", "gpt-4"), - api_key=self.model_config.get("api_key") + model_id=self.config.get("llm_model", "neural-chat-7b-v3-1"), + api_key=self.nebius_api_key, + api_base=self.config.get("nebius_api_base", "https://api.studio.nebius.com/v1") ) - elif provider == "anthropic": + elif provider == "openai": return LiteLLMModel( - model_id="claude-3-opus-20240229", - api_key=self.model_config.get("api_key") - ) - elif provider == "huggingface": - from smolagents import InferenceClientModel - return InferenceClientModel( - model_id=self.model_config.get("model_id", "mistralai/Mixtral-8x7B-Instruct-v0.1") + model_id=self.config.get("llm_model", "gpt-4"), + api_key=self.config.get("openai_api_key") ) else: - # Default to OpenAI - return LiteLLMModel(model_id="gpt-4") + # Default to Nebius if available + if self.nebius_api_key: + return LiteLLMModel( + model_id="neural-chat-7b-v3-1", + api_key=self.nebius_api_key, + api_base="https://api.studio.nebius.com/v1" + ) + else: + return LiteLLMModel(model_id="gpt-4") async def process_document(self, file_path: str, process_id: str = None) -> Dict[str, Any]: - """Procesar documento completo""" + """Procesar documento completo con Nebius""" import time - start_time = time.time() # Generar ID de proceso - self.current_process_id = process_id or hashlib.md5( - f"{file_path}_{datetime.now().isoformat()}".encode() - ).hexdigest()[:8] - - logger.info(f"Starting process {self.current_process_id} for {file_path}") + process_id = process_id or self._generate_process_id(file_path) - # 1. Extraer texto del archivo - extraction_prompt = f""" - Process the file at {file_path} to extract all text content. - Focus on extracting any bibliographic references, citations, or academic resources. - - Steps: - 1. Use process_file tool to extract text - 2. Return the extracted text for further analysis - """ + logger.info(f"[{process_id}] Processing document: {file_path}") try: - # Ejecutar agente de extracción de archivos - file_result = await self.extraction_agent.run_async(extraction_prompt) - - if not file_result or "text" not in str(file_result): - return { - "success": False, - "error": "Failed to extract text from file", - "process_id": self.current_process_id - } - - # 2. Extraer referencias bibliográficas - text_content = str(file_result) - extraction_prompt2 = f""" - Analyze the following text and extract all bibliographic references: - - {text_content[:5000]}... # Limitar tamaño para el prompt - - Extract: - 1. DOIs (Digital Object Identifiers) - 2. ISBNs - 3. arXiv IDs - 4. PubMed IDs (PMID) - 5. Academic URLs - 6. Any other academic references - - Return a comprehensive list of all found references. - """ - - extraction_result = await self.extraction_agent.run_async(extraction_prompt2) + # 1. Leer archivo + file_content = self._read_file(file_path) + if not file_content: + return self._error_result(process_id, "Empty or unreadable file") + + # 2. Extraer referencias + logger.info(f"[{process_id}] Extracting references...") + references = self.extraction_tool.forward( + text=file_content, + nebius_api_key=self.nebius_api_key, + use_ai_enhancement=self.use_nebius + ) - # Parsear resultado (asumiendo que el agente devuelve texto JSON-like) - citations = [] - try: - # Intentar extraer JSON del resultado - import json - result_str = str(extraction_result) - - # Buscar patrón JSON - json_match = re.search(r'\{.*\}', result_str, re.DOTALL) - if json_match: - citations_data = json.loads(json_match.group()) - if isinstance(citations_data, list): - citations = [CitationModel(**c) for c in citations_data] - except: - # Fallback: usar la herramienta directamente - citations_data = self.extraction_tool.forward(text_content, os.path.basename(file_path)) - citations = [CitationModel(**c) for c in citations_data] + if self.use_nebius: + self.stats["nebius_calls"] += 1 - logger.info(f"Found {len(citations)} citations") + self.stats["total_references"] += len(references) + logger.info(f"[{process_id}] Found {len(references)} references") - # 3. Verificar recursos - verified_resources = [] + # 3. Verificar referencias + logger.info(f"[{process_id}] Verifying references...") + verification_results = [] failed_verifications = [] - for citation in citations: - verification_prompt = f""" - Verify the following academic resource: + for i, ref in enumerate(references): + if i % 5 == 0: # Log cada 5 referencias + logger.info(f"[{process_id}] Verified {i}/{len(references)}") - Type: {citation.resource_type} - Identifier: {citation.identifier} - Source: {citation.extracted_from} - - Check if this resource exists and is accessible. - """ + # Verificar referencia + verification = self.verification_tool.forward( + reference=ref, + nebius_api_key=self.nebius_api_key, + deep_verify=self.use_nebius + ) - try: - verification_result = await self.verification_agent.run_async(verification_prompt) + if verification.get("verified"): + # Convertir a modelo + citation = CitationModel( + id=ref.get("id"), + raw_text=ref.get("raw_text", ""), + resource_type=ResourceType(ref.get("type", "unknown")), + identifier=ref.get("identifier", ""), + confidence=ref.get("confidence", 0.0), + extracted_from=file_path, + position=ref.get("position", (0, 0)), + nebius_verified=self.use_nebius, + nebius_confidence=verification.get("quality_score", 0.0) + ) - # Parsear resultado - if verification_result: - verification_dict = self.verification_tool.forward(citation.dict()) - verified_resource = VerificationResult(**verification_dict) - - if verified_resource.verified: - verified_resources.append(verified_resource) - else: - failed_verifications.append(citation) - except Exception as e: - logger.error(f"Verification error for {citation.identifier}: {e}") + vr = VerificationResult( + citation=citation, + verified=True, + verification_source=verification.get("verification_source", "unknown"), + download_url=verification.get("download_url"), + file_format=verification.get("file_format"), + file_size=verification.get("file_size"), + quality_score=verification.get("quality_score", 0.0), + notes=verification.get("notes", []), + nebius_analysis=verification.get("nebius_analysis") + ) + verification_results.append(vr) + else: + # Referencia fallida + citation = CitationModel( + id=ref.get("id"), + raw_text=ref.get("raw_text", ""), + resource_type=ResourceType(ref.get("type", "unknown")), + identifier=ref.get("identifier", ""), + confidence=ref.get("confidence", 0.0), + extracted_from=file_path, + position=ref.get("position", (0, 0)), + nebius_verified=False, + nebius_confidence=0.0 + ) failed_verifications.append(citation) - # 4. Descargar recursos verificados - downloaded_files = [] - - for verified_resource in verified_resources: - if verified_resource.download_url: - download_prompt = f""" - Download the academic paper from: - - URL: {verified_resource.download_url} - Format: {verified_resource.file_format} - - Save it to: {self.download_dir} - """ - - try: - download_result = await self.download_agent.run_async(download_prompt) - - if download_result: - download_dict = self.download_tool.forward( - verified_resource.dict(), - self.download_dir - ) - - if download_dict.get("success"): - downloaded_files.append(download_dict.get("file_path")) - except Exception as e: - logger.error(f"Download error: {e}") + # 4. Descargar archivos verificados + logger.info(f"[{process_id}] Downloading files...") + downloaded_files = await self._download_files( + verification_results, + process_id + ) # 5. Generar reporte processing_time = time.time() - start_time report = ProcessingReport( input_file=file_path, - total_citations=len(citations), - verified_resources=verified_resources, + total_citations=len(references), + verified_resources=verification_results, downloaded_files=downloaded_files, failed_verifications=failed_verifications, processing_time=processing_time, summary={ - "success_rate": len(verified_resources) / max(1, len(citations)), - "download_rate": len(downloaded_files) / max(1, len(verified_resources)), - "file_count": len(downloaded_files) + "success_rate": len(verification_results) / max(1, len(references)), + "download_rate": len(downloaded_files) / max(1, len(verification_results)), + "avg_quality": sum(vr.quality_score for vr in verification_results) / max(1, len(verification_results)) + }, + nebius_usage={ + "enabled": self.use_nebius, + "calls": self.stats["nebius_calls"], + "enhanced_references": sum(1 for vr in verification_results if vr.nebius_analysis) } ) - # Guardar reporte - report_path = os.path.join( - self.report_dir, - f"report_{self.current_process_id}.json" - ) - - with open(report_path, 'w', encoding='utf-8') as f: - json.dump(report.dict(), f, indent=2, default=str) + # 6. Guardar resultados + self._save_results(report, process_id) - # 6. Crear archivo ZIP con resultados - zip_path = self._create_results_zip(report) + self.stats["total_processed"] += 1 + self.stats["success_rate"] = report.summary.get("success_rate", 0.0) - # Guardar resultados en estado - self.processing_results[self.current_process_id] = { - "report": report.dict(), - "zip_path": zip_path, - "timestamp": datetime.now().isoformat() - } - - logger.info(f"Process {self.current_process_id} completed in {processing_time:.2f}s") + logger.info(f"[{process_id}] Processing completed in {processing_time:.2f}s") return { "success": True, - "process_id": self.current_process_id, + "process_id": process_id, "report": report.dict(), - "zip_path": zip_path, + "zip_path": self._create_zip(report, process_id), "summary": { - "citations_found": len(citations), - "resources_verified": len(verified_resources), - "files_downloaded": len(downloaded_files), - "processing_time": processing_time + "references_found": len(references), + "verified": len(verification_results), + "downloaded": len(downloaded_files), + "success_rate": f"{report.summary.get('success_rate', 0) * 100:.1f}%", + "processing_time": f"{processing_time:.2f}s" } } except Exception as e: - logger.error(f"Processing error: {e}") - return { - "success": False, - "error": str(e), - "process_id": self.current_process_id - } + logger.error(f"[{process_id}] Processing error: {e}") + return self._error_result(process_id, str(e)) + + def _read_file(self, file_path: str) -> str: + """Leer contenido del archivo""" + try: + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + return f.read() + except Exception as e: + logger.error(f"Error reading file {file_path}: {e}") + return "" + + async def _download_files(self, verification_results: List[VerificationResult], + process_id: str) -> List[str]: + """Descargar archivos de URLs verificadas""" + downloaded_files = [] + + for i, vr in enumerate(verification_results): + if vr.download_url: + try: + file_path = await self._download_file( + vr.download_url, + vr.citation.identifier, + process_id, + i + ) + if file_path: + downloaded_files.append(file_path) + except Exception as e: + logger.error(f"Download failed for {vr.citation.identifier}: {e}") + + return downloaded_files + + async def _download_file(self, url: str, identifier: str, + process_id: str, index: int) -> Optional[str]: + """Descargar un archivo individual""" + import aiohttp + + try: + # Crear nombre de archivo seguro + safe_name = re.sub(r'[^\w\-\.]', '_', identifier) + if len(safe_name) > 100: + safe_name = safe_name[:100] + + # Determinar extensión + extension = self._get_extension_from_url(url) + if not extension: + extension = ".pdf" # Default + + filename = f"{process_id}_{index:03d}_{safe_name}{extension}" + filepath = os.path.join(self.download_dir, filename) + + # Descargar + timeout = aiohttp.ClientTimeout(total=60) + async with aiohttp.ClientSession(timeout=timeout) as session: + async with session.get(url, headers={'User-Agent': 'Mozilla/5.0'}) as response: + if response.status == 200: + content = await response.read() + + # Verificar que sea un archivo válido + if len(content) > 100: # Archivo no vacío + with open(filepath, 'wb') as f: + f.write(content) + + logger.info(f"Downloaded: {filename} ({len(content)} bytes)") + return filepath + + return None + + except Exception as e: + logger.error(f"Download error for {url}: {e}") + return None + + def _get_extension_from_url(self, url: str) -> str: + """Obtener extensión de archivo desde URL""" + url_lower = url.lower() + + if '.pdf' in url_lower: + return '.pdf' + elif '.docx' in url_lower or '.doc' in url_lower: + return '.docx' + elif '.html' in url_lower or '.htm' in url_lower: + return '.html' + elif '.txt' in url_lower: + return '.txt' + elif '.epub' in url_lower: + return '.epub' + + return "" + + def _generate_process_id(self, file_path: str) -> str: + """Generar ID único de proceso""" + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + file_hash = hashlib.md5(file_path.encode()).hexdigest()[:6] + return f"NB_{timestamp}_{file_hash}" + + def _save_results(self, report: ProcessingReport, process_id: str): + """Guardar resultados en disco""" + # Guardar reporte JSON + report_path = os.path.join(self.report_dir, f"{process_id}_report.json") + with open(report_path, 'w', encoding='utf-8') as f: + json.dump(report.dict(), f, indent=2, default=str) + + # Guardar resumen en texto + summary_path = os.path.join(self.report_dir, f"{process_id}_summary.txt") + with open(summary_path, 'w', encoding='utf-8') as f: + f.write(self._generate_text_summary(report)) - def _create_results_zip(self, report: ProcessingReport) -> str: + def _create_zip(self, report: ProcessingReport, process_id: str) -> str: """Crear archivo ZIP con resultados""" import zipfile - from datetime import datetime - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - zip_filename = f"bibliography_results_{timestamp}.zip" - zip_path = os.path.join(self.output_dir, zip_filename) + zip_path = os.path.join(self.output_base, f"{process_id}_results.zip") with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf: - # Agregar reporte - report_path = os.path.join( - self.report_dir, - f"report_{self.current_process_id}.json" - ) - if os.path.exists(report_path): - zipf.write(report_path, "report.json") + # Agregar reportes + report_files = [ + f for f in os.listdir(self.report_dir) + if f.startswith(process_id) + ] + + for file in report_files: + filepath = os.path.join(self.report_dir, file) + zipf.write(filepath, f"reports/{file}") # Agregar archivos descargados for file_path in report.downloaded_files: if os.path.exists(file_path): - arcname = os.path.join("downloads", os.path.basename(file_path)) - zipf.write(file_path, arcname) + filename = os.path.basename(file_path) + zipf.write(file_path, f"downloads/{filename}") - # Agregar resumen en texto - summary_content = self._generate_summary_text(report) - zipf.writestr("summary.txt", summary_content) + # Agregar log + log_path = os.path.join(self.log_dir, f"{process_id}_log.txt") + with open(log_path, 'w') as f: + f.write(f"Process ID: {process_id}\n") + f.write(f"Time: {datetime.now().isoformat()}\n") + f.write(f"Success rate: {report.summary.get('success_rate', 0) * 100:.1f}%\n") + + zipf.write(log_path, "process_log.txt") return zip_path - def _generate_summary_text(self, report: ProcessingReport) -> str: + def _generate_text_summary(self, report: ProcessingReport) -> str: """Generar resumen en texto""" summary = f""" - BIBLIOGRAPHY PROCESSING REPORT - ============================== + NEBIUS BIBLIOGRAPHY PROCESSING REPORT + ===================================== - Process ID: {self.current_process_id} + Process ID: Generated automatically Input File: {report.input_file} Processing Time: {report.processing_time:.2f} seconds Timestamp: {report.timestamp} - STATISTICS - ---------- - Total Citations Found: {report.total_citations} - Resources Verified: {len(report.verified_resources)} + SUMMARY STATISTICS + ------------------ + Total References Found: {report.total_citations} + Successfully Verified: {len(report.verified_resources)} Files Downloaded: {len(report.downloaded_files)} - Failed Verifications: {len(report.failed_verifications)} + Verification Success Rate: {report.summary.get('success_rate', 0) * 100:.1f}% + Average Quality Score: {report.summary.get('avg_quality', 0):.2f} - Success Rate: {(len(report.verified_resources) / max(1, report.total_citations)) * 100:.1f}% - Download Rate: {(len(report.downloaded_files) / max(1, len(report.verified_resources))) * 100:.1f}% + NEBIUS AI USAGE + --------------- + Enabled: {report.nebius_usage.get('enabled', False)} + API Calls: {report.nebius_usage.get('calls', 0)} + Enhanced References: {report.nebius_usage.get('enhanced_references', 0)} - VERIFIED RESOURCES - ------------------ + VERIFIED RESOURCES (Top 10) + --------------------------- """ - for i, resource in enumerate(report.verified_resources, 1): - summary += f"\n{i}. {resource.citation.identifier}" - summary += f"\n Type: {resource.citation.resource_type}" - summary += f"\n Source: {resource.verification_source}" - summary += f"\n Quality: {resource.quality_score:.2f}" - if resource.download_url: + for i, vr in enumerate(report.verified_resources[:10], 1): + summary += f"\n{i}. {vr.citation.identifier}" + summary += f"\n Type: {vr.citation.resource_type.value}" + summary += f"\n Source: {vr.verification_source}" + summary += f"\n Quality: {vr.quality_score:.2f}" + summary += f"\n Nebius Enhanced: {vr.citation.nebius_verified}" + if vr.download_url: summary += f"\n Downloaded: Yes" - if resource.file_format: - summary += f" ({resource.file_format})" summary += "\n" if report.failed_verifications: - summary += f"\nFAILED VERIFICATIONS\n-------------------\n" - for citation in report.failed_verifications: - summary += f"- {citation.identifier} ({citation.resource_type})\n" + summary += f"\nFAILED VERIFICATIONS ({len(report.failed_verifications)})\n" + summary += "-" * 40 + "\n" + for citation in report.failed_verifications[:5]: + summary += f"- {citation.identifier} ({citation.resource_type.value})\n" - summary += f"\nFILES DOWNLOADED\n----------------\n" + summary += f"\nFILES DOWNLOADED\n" + summary += "-" * 40 + "\n" for file_path in report.downloaded_files: - file_size = os.path.getsize(file_path) if os.path.exists(file_path) else 0 - summary += f"- {os.path.basename(file_path)} ({file_size} bytes)\n" + if os.path.exists(file_path): + file_size = os.path.getsize(file_path) + summary += f"- {os.path.basename(file_path)} ({file_size} bytes)\n" return summary - def get_status(self, process_id: str = None) -> Dict[str, Any]: - """Obtener estado del proceso""" - pid = process_id or self.current_process_id - if pid and pid in self.processing_results: - return self.processing_results[pid] - return {"error": "Process not found"} + def _error_result(self, process_id: str, error: str) -> Dict[str, Any]: + """Generar resultado de error""" + return { + "success": False, + "process_id": process_id, + "error": error, + "timestamp": datetime.now().isoformat() + } - def cleanup(self, process_id: str = None): - """Limpiar archivos temporales""" - import shutil - - if process_id: - # Limpiar proceso específico - if process_id in self.processing_results: - del self.processing_results[process_id] - else: - # Limpiar todo - self.processing_results.clear() - - # Limpiar directorios (opcional, descomentar si se necesita) - # shutil.rmtree(self.download_dir, ignore_errors=True) - # shutil.rmtree(self.report_dir, ignore_errors=True) + def get_stats(self) -> Dict[str, Any]: + """Obtener estadísticas del sistema""" + return { + "total_processed": self.stats["total_processed"], + "total_references": self.stats["total_references"], + "nebius_calls": self.stats["nebius_calls"], + "success_rate": self.stats["success_rate"], + "output_directory": self.output_base + } -# ========== INTERFAZ GRADIO ========== +# ========== INTERFAZ GRADIO MEJORADA ========== -def create_gradio_interface(): - """Crear interfaz Gradio para el sistema""" +def create_nebius_interface(): + """Crear interfaz Gradio con soporte para Nebius""" system = None + current_process = None - def initialize_system(provider, model_id, api_key): + def initialize_system(provider, model, nebius_key, nebius_base, openai_key): """Inicializar sistema con configuración""" nonlocal system config = { - "provider": provider, - "model_id": model_id, - "api_key": api_key + "llm_provider": provider, + "llm_model": model, + "nebius_api_key": nebius_key, + "nebius_api_base": nebius_base or "https://api.studio.nebius.com/v1", + "openai_api_key": openai_key, + "use_nebius": bool(nebius_key) } try: - system = BibliographyProcessingSystem(config) - return "✅ Sistema inicializado correctamente" + system = NebiusBibliographySystem(config) + return "✅ Sistema inicializado con Nebius AI" if nebius_key else "✅ Sistema inicializado (sin Nebius)" except Exception as e: return f"❌ Error: {str(e)}" - async def process_file(file_obj, progress=gr.Progress()): - """Procesar archivo""" + async def process_document(file_obj, use_nebius, progress=gr.Progress()): + """Procesar documento""" + nonlocal system, current_process + if not system: - return None, "❌ Sistema no inicializado", "", "" + return None, "❌ Sistema no inicializado", "", "", "" try: - progress(0, desc="Iniciando procesamiento...") + progress(0, desc="Preparando archivo...") # Guardar archivo temporalmente import tempfile - with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file_obj.name)[1]) as tmp: - with open(file_obj.name, 'rb') as src: - tmp.write(src.read()) - tmp_path = tmp.name + import shutil + + temp_dir = tempfile.mkdtemp() + file_path = os.path.join(temp_dir, file_obj.name) + shutil.copy(file_obj.name, file_path) - progress(0.2, desc="Extrayendo texto...") + progress(0.1, desc="Procesando con Nebius..." if use_nebius else "Procesando...") - # Procesar archivo - result = await system.process_document(tmp_path) + # Procesar documento + result = await system.process_document(file_path) if not result.get("success"): - return None, f"❌ Error: {result.get('error')}", "", "" + # Limpiar temporal + shutil.rmtree(temp_dir, ignore_errors=True) + return None, f"❌ Error: {result.get('error')}", "", "", "" - # Obtener reporte - report_data = result.get("report", {}) + current_process = result.get("process_id") summary = result.get("summary", {}) - progress(0.8, desc="Generando resultados...") - - # Preparar resultados para visualización - citations_found = summary.get("citations_found", 0) - verified = summary.get("resources_verified", 0) - downloaded = summary.get("files_downloaded", 0) - - # Generar HTML para visualización - html_output = f""" -
-

📊 Resultados del Procesamiento

- -
-

📈 Estadísticas

- -
- """ + progress(0.9, desc="Generando reportes...") - # Lista de recursos verificados - if verified > 0: - html_output += """ -
-

✅ Recursos Verificados

-
" - - # Lista de fallos - failed = len(report_data.get("failed_verifications", [])) - if failed > 0: - html_output += f""" -
-

❌ Recursos No Verificados ({failed})

-

Algunos recursos no pudieron ser verificados. Revisa el archivo ZIP para más detalles.

-
- """ - - html_output += "
" + # Generar visualizaciones + report_data = result.get("report", {}) - # Texto plano para exportación - text_output = f""" - Procesamiento Bibliográfico - =========================== + # HTML output + html_output = self._generate_html_report(report_data) - Archivo: {file_obj.name} - Proceso ID: {result.get('process_id')} - Fecha: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} + # Text output + text_output = self._generate_text_report(report_data) - Resumen: - - Referencias encontradas: {citations_found} - - Recursos verificados: {verified} - - Archivos descargados: {downloaded} - - Tasa de éxito: {(verified/max(1, citations_found))*100:.1f}% + # JSON output + json_output = json.dumps(report_data, indent=2, default=str) - Para ver el reporte completo, descarga el archivo ZIP. - """ + # Statistics + stats_output = self._generate_stats_display(summary) progress(1.0, desc="Completado!") - # Devolver resultados + # Limpiar temporal + shutil.rmtree(temp_dir, ignore_errors=True) + return ( result.get("zip_path"), - f"✅ Procesamiento completado. ID: {result.get('process_id')}", + f"✅ Proceso {current_process} completado", html_output, - text_output + text_output, + json_output, + stats_output ) except Exception as e: - logger.error(f"Error en procesamiento: {e}") - return None, f"❌ Error: {str(e)}", "", "" + logger.error(f"Processing error: {e}") + return None, f"❌ Error: {str(e)}", "", "", "", "" - def get_status(): - """Obtener estado del sistema""" - if not system or not system.current_process_id: - return "⚠️ No hay procesos activos" + def _generate_html_report(self, report_data: Dict) -> str: + """Generar reporte HTML""" + verified = len(report_data.get("verified_resources", [])) + total = report_data.get("total_citations", 0) + success_rate = (verified / max(1, total)) * 100 + + nebius_usage = report_data.get("nebius_usage", {}) + + html = f""" +
+

📊 Reporte de Procesamiento Nebius

+ +
+

📈 Resumen General

+
+
+ Referencias Encontradas
+ {total} +
+
+ Verificadas
+ {verified} +
+
+ Tasa de Éxito
+ {success_rate:.1f}% +
+
+ Tiempo
+ {report_data.get('processing_time', 0):.1f}s +
+
+
+ +
+

🤖 Nebius AI

+

Estado: {'✅ Activado' if nebius_usage.get('enabled') else '❌ Desactivado'}

+

Llamadas API: {nebius_usage.get('calls', 0)}

+

Referencias Mejoradas: {nebius_usage.get('enhanced_references', 0)}

+
+ +
+

📥 Descargas

+

Archivos Descargados: {len(report_data.get('downloaded_files', []))}

+ +
+ +
+

⚠️ Referencias No Verificadas

+

Total: {failed}

+ """.format(failed=len(report_data.get("failed_verifications", []))) + + html += """ +
+
+ """ + + return html + + def _generate_text_report(self, report_data: Dict) -> str: + """Generar reporte en texto""" + verified = len(report_data.get("verified_resources", [])) + total = report_data.get("total_citations", 0) + + text = f""" + REPORTE DE PROCESAMIENTO + ======================== + + Archivo: {report_data.get('input_file', 'Desconocido')} + Fecha: {report_data.get('timestamp', '')} + + ESTADÍSTICAS: + ------------- + • Referencias encontradas: {total} + • Referencias verificadas: {verified} + • Archivos descargados: {len(report_data.get('downloaded_files', []))} + • Tiempo de procesamiento: {report_data.get('processing_time', 0):.2f}s + • Tasa de éxito: {(verified/max(1, total))*100:.1f}% + + NEBIUS AI: + ---------- + • Estado: {'Activado' if report_data.get('nebius_usage', {}).get('enabled') else 'Desactivado'} + • Llamadas API: {report_data.get('nebius_usage', {}).get('calls', 0)} + • Referencias mejoradas: {report_data.get('nebius_usage', {}).get('enhanced_references', 0)} + + Para más detalles, consulte el archivo ZIP con el reporte completo. + """ + + return text + + def _generate_stats_display(self, summary: Dict) -> str: + """Generar display de estadísticas""" return f""" - 📊 Estado del Sistema - --------------------- - Proceso activo: {system.current_process_id} - Total procesos: {len(system.processing_results)} - Último reporte: {status.get('timestamp', 'N/A')} + ⚡ PROCESO COMPLETADO ⚡ + + 📊 Estadísticas Rápidas: + • Referencias: {summary.get('references_found', 0)} + • Verificadas: {summary.get('verified', 0)} + • Descargadas: {summary.get('downloaded', 0)} + • Tasa de éxito: {summary.get('success_rate', '0%')} + • Tiempo: {summary.get('processing_time', '0s')} + """ + + def get_system_stats(): + """Obtener estadísticas del sistema""" + nonlocal system + + if not system: + return "❌ Sistema no inicializado" + + stats = system.get_stats() + + return f""" + 📈 Estadísticas del Sistema Nebius: + + • Documentos procesados: {stats.get('total_processed', 0)} + • Referencias totales: {stats.get('total_references', 0)} + • Llamadas Nebius API: {stats.get('nebius_calls', 0)} + • Tasa de éxito promedio: {stats.get('success_rate', 0) * 100:.1f}% + • Directorio de salida: {stats.get('output_directory', 'N/A')} """ # Crear interfaz - with gr.Blocks(title="Sistema de Recopilación Bibliográfica", theme=gr.themes.Soft()) as interface: - gr.Markdown("# 📚 Sistema de Recopilación Bibliográfica con IA") - gr.Markdown("Procesa documentos y extrae referencias bibliográficas automáticamente") + with gr.Blocks(title="Nebius Bibliography System", theme=gr.themes.Soft()) as interface: + gr.Markdown("# 📚 Sistema de Recopilación Bibliográfica con Nebius AI") + gr.Markdown("Procesa documentos académicos usando Nebius AI para extracción y verificación inteligente") with gr.Row(): with gr.Column(scale=1): - gr.Markdown("### ⚙️ Configuración") + gr.Markdown("### ⚙️ Configuración Nebius AI") provider = gr.Dropdown( - choices=["openai", "anthropic", "huggingface"], - label="Proveedor de IA", - value="openai" + choices=["nebius", "openai"], + label="Proveedor de IA Principal", + value="nebius", + info="Selecciona Nebius para usar la API de Nebius AI" ) - model_id = gr.Textbox( - label="Model ID", - value="gpt-4", - placeholder="Ej: gpt-4, claude-3-opus-20240229, mistralai/Mixtral-8x7B-Instruct-v0.1" + model = gr.Textbox( + label="Modelo", + value="neural-chat-7b-v3-1", + placeholder="Modelo de Nebius (ej: neural-chat-7b-v3-1)" ) - api_key = gr.Textbox( - label="API Key", + nebius_key = gr.Textbox( + label="Nebius API Key", type="password", - placeholder="Ingresa tu API key" + placeholder="Ingresa tu API Key de Nebius" ) - init_btn = gr.Button("🚀 Inicializar Sistema", variant="primary") - init_status = gr.Markdown("") + nebius_base = gr.Textbox( + label="Nebius API Base (opcional)", + value="https://api.studio.nebius.com/v1", + placeholder="URL base de la API de Nebius" + ) - init_btn.click( - initialize_system, - inputs=[provider, model_id, api_key], - outputs=init_status + openai_key = gr.Textbox( + label="OpenAI API Key (respaldo)", + type="password", + placeholder="Opcional: Key de OpenAI como respaldo" ) + init_btn = gr.Button("🚀 Inicializar Sistema Nebius", variant="primary") + init_status = gr.Markdown("") + gr.Markdown("---") - status_btn = gr.Button("📊 Ver Estado") - system_status = gr.Markdown("") - status_btn.click(get_status, outputs=system_status) + stats_btn = gr.Button("📊 Estadísticas del Sistema") + system_stats = gr.Markdown("") with gr.Column(scale=2): gr.Markdown("### 📄 Procesar Documento") file_input = gr.File( label="Sube tu documento", - file_types=[".txt", ".pdf", ".docx", ".html", ".md", ".rtf"] + file_types=[".txt", ".pdf", ".docx", ".html", ".md"] ) - process_btn = gr.Button("🔍 Procesar Documento", variant="primary") + use_nebius = gr.Checkbox( + label="Usar Nebius AI para mejora de precisión", + value=True + ) + + process_btn = gr.Button("🔍 Procesar con Nebius AI", variant="primary") - gr.Markdown("### 📊 Resultados") + gr.Markdown("### 📦 Resultados") - result_file = gr.File(label="Descargar Resultados (ZIP)") + result_file = gr.File(label="Descargar Paquete Completo (ZIP)") result_status = gr.Markdown("") + stats_display = gr.Markdown("") with gr.Tabs(): with gr.TabItem("📋 Vista HTML"): - html_output = gr.HTML(label="Resultados Detallados") + html_output = gr.HTML(label="Reporte Interactivo") with gr.TabItem("📝 Texto Plano"): text_output = gr.Textbox( label="Resumen", - lines=20, - max_lines=50 + lines=15, + max_lines=30 + ) + + with gr.TabItem("🔧 JSON Completo"): + json_output = gr.Code( + label="Datos Completos", + language="json", + lines=20 ) - - process_btn.click( - process_file, - inputs=[file_input], - outputs=[result_file, result_status, html_output, text_output] - ) - # Ejemplos - gr.Markdown("### 📖 Ejemplos") - gr.Examples( - examples=[ - ["ejemplo_referencias.txt"], - ["ejemplo_bibliografia.pdf"], - ["paper_con_referencias.docx"] - ], - inputs=[file_input], - label="Archivos de ejemplo (necesitan ser creados)" + # Conectar eventos + init_btn.click( + initialize_system, + inputs=[provider, model, nebius_key, nebius_base, openai_key], + outputs=init_status + ) + + process_btn.click( + process_document, + inputs=[file_input, use_nebius], + outputs=[result_file, result_status, html_output, text_output, json_output, stats_display] + ) + + stats_btn.click( + get_system_stats, + outputs=system_stats ) # Información gr.Markdown(""" - ### 📌 Información - - **Formatos soportados**: TXT, PDF, DOCX, HTML, MD, RTF - - **Recursos detectados**: DOI, ISBN, arXiv, PMID, URLs académicas - - **Salida**: Archivo ZIP con reportes y documentos descargados - - ### ⚠️ Notas - 1. Necesitas una API key válida para el proveedor seleccionado - 2. Los archivos grandes pueden tardar varios minutos - 3. La precisión depende del modelo de IA utilizado + ### 📌 Características Nebius AI + + **🔍 Extracción Inteligente:** + - Identificación contextual de referencias + - Corrección automática de identificadores + - Clasificación por tipo de recurso + + **✅ Verificación Avanzada:** + - Análisis de accesibilidad + - Detección de acceso abierto + - Sugerencias de fuentes alternativas + + **📊 Reportes Mejorados:** + - Métricas de confianza Nebius + - Análisis de calidad por referencia + - Estadísticas de uso de IA + + ### ⚠️ Notas Importantes + + 1. La API de Nebius requiere una key válida + 2. Los archivos grandes pueden consumir más tokens + 3. Se recomienda usar Nebius para máxima precisión + 4. Mantén tu API key segura y no la compartas + + ### 🔗 Recursos + + • [Documentación Nebius AI](https://docs.nebius.com) + • [Obtener API Key](https://studio.nebius.com) + • [Soporte Técnico](https://support.nebius.com) """) return interface @@ -1385,19 +1335,20 @@ async def main(): """Función principal""" import argparse - parser = argparse.ArgumentParser(description="Sistema de Recopilación Bibliográfica") + parser = argparse.ArgumentParser(description="Sistema Nebius de Recopilación Bibliográfica") parser.add_argument("--mode", choices=["gui", "cli"], default="gui", help="Modo de ejecución") parser.add_argument("--file", type=str, help="Archivo a procesar (modo CLI)") - parser.add_argument("--provider", default="openai", help="Proveedor de IA") - parser.add_argument("--model", default="gpt-4", help="Modelo de IA") - parser.add_argument("--api-key", help="API Key") + parser.add_argument("--nebius-key", help="API Key de Nebius") + parser.add_argument("--model", default="neural-chat-7b-v3-1", help="Modelo Nebius") + parser.add_argument("--api-base", default="https://api.studio.nebius.com/v1", + help="URL base de Nebius API") args = parser.parse_args() if args.mode == "gui": # Ejecutar interfaz Gradio - interface = create_gradio_interface() + interface = create_nebius_interface() interface.launch( server_name="0.0.0.0", server_port=7860, @@ -1415,21 +1366,28 @@ async def main(): print(f"❌ Error: Archivo no encontrado: {args.file}") return + if not args.nebius_key: + print("⚠️ Advertencia: No se proporcionó API Key de Nebius") + use_nebius = False + nebius_key = None + else: + use_nebius = True + nebius_key = args.nebius_key + # Configurar sistema config = { - "provider": args.provider, - "model_id": args.model, - "api_key": args.api_key or os.getenv(f"{args.provider.upper()}_API_KEY") + "llm_provider": "nebius" if use_nebius else "openai", + "llm_model": args.model, + "nebius_api_key": nebius_key, + "nebius_api_base": args.api_base, + "use_nebius": use_nebius } - if not config["api_key"]: - print(f"❌ Error: Necesitas especificar una API key") - return - - system = BibliographyProcessingSystem(config) + system = NebiusBibliographySystem(config) print(f"🔍 Procesando archivo: {args.file}") - print("⏳ Esto puede tardar varios minutos...") + print(f"🤖 Nebius AI: {'Activado' if use_nebius else 'Desactivado'}") + print("⏳ Procesando...") result = await system.process_document(args.file) @@ -1440,12 +1398,17 @@ async def main(): summary = result.get("summary", {}) print(f""" 📈 Resultados: - - Referencias encontradas: {summary.get('citations_found', 0)} - - Recursos verificados: {summary.get('resources_verified', 0)} - - Archivos descargados: {summary.get('files_downloaded', 0)} - - Tiempo de procesamiento: {summary.get('processing_time', 0):.2f}s + - Referencias encontradas: {summary.get('references_found', 0)} + - Referencias verificadas: {summary.get('verified', 0)} + - Archivos descargados: {summary.get('downloaded', 0)} + - Tasa de éxito: {summary.get('success_rate', '0%')} + - Tiempo de procesamiento: {summary.get('processing_time', '0s')} + + 📦 Paquete de resultados: {result.get('zip_path')} - 📦 Archivo ZIP con resultados: {result.get('zip_path')} + 📊 Estadísticas Nebius: + - Llamadas API: {result.get('report', {}).get('nebius_usage', {}).get('calls', 0)} + - Referencias mejoradas: {result.get('report', {}).get('nebius_usage', {}).get('enhanced_references', 0)} """) else: print(f"❌ Error: {result.get('error')}")