# FreeBibTec2 / app.py
import os
import re
import json
import logging
import zipfile
import asyncio
import tempfile
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, field
from pathlib import Path
from datetime import datetime
import gradio as gr
from enum import Enum
import hashlib
import urllib.parse
import aiohttp
# Import smolagents
from smolagents import CodeAgent, ToolCallingAgent, LiteLLMModel
from smolagents.tools import Tool, tool
from pydantic import BaseModel, Field
# Logging configuration
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler('bibliography_nebius.log'),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
# ========== NEBIUS API CONFIGURATION ==========
class NebiusAPI:
"""Client for the Nebius AI API"""
def __init__(self, api_key: str, base_url: str = "https://api.studio.nebius.com"):
self.api_key = api_key
self.base_url = base_url
self.headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json"
}
async def generate_text(self, prompt: str, model: str = "neural-chat-7b-v3-1",
max_tokens: int = 1000, temperature: float = 0.7) -> str:
"""Generar texto usando modelos de Nebius"""
url = f"{self.base_url}/v1/chat/completions"
payload = {
"model": model,
"messages": [
{"role": "user", "content": prompt}
],
"max_tokens": max_tokens,
"temperature": temperature,
"top_p": 0.95
}
try:
async with aiohttp.ClientSession() as session:
async with session.post(
url,
headers=self.headers,
json=payload,
timeout=30
) as response:
if response.status == 200:
data = await response.json()
return data.get("choices", [{}])[0].get("message", {}).get("content", "")
else:
error_text = await response.text()
logger.error(f"Nebius API error {response.status}: {error_text}")
return ""
except Exception as e:
logger.error(f"Error calling Nebius API: {e}")
return ""
async def extract_references(self, text: str) -> List[Dict[str, Any]]:
"""Usar Nebius para extraer referencias de texto"""
prompt = f"""Analiza el siguiente texto y extrae todas las referencias bibliográficas.
Identifica DOIs, ISBNs, URLs académicas, arXiv IDs y otras referencias académicas.
Texto:
{text[:5000]}
Devuelve un JSON con el siguiente formato:
{{
"references": [
{{
"type": "doi|isbn|arxiv|url|pmid|other",
"identifier": "identificador_completo",
"raw_text": "texto_original_encontrado",
"confidence": 0.0-1.0,
"context": "texto_alrededor_del_identificador"
}}
]
}}
Solo devuelve el JSON, sin texto adicional."""
response = await self.generate_text(prompt, max_tokens=2000)
try:
# Look for JSON in the response
json_match = re.search(r'\{.*\}', response, re.DOTALL)
if json_match:
data = json.loads(json_match.group())
return data.get("references", [])
except Exception as e:
logger.error(f"Error parsing Nebius response: {e}")
return []
async def verify_reference(self, reference: Dict[str, Any]) -> Dict[str, Any]:
"""Verificar una referencia usando Nebius"""
prompt = f"""Verifica la siguiente referencia académica y proporciona información sobre su accesibilidad:
Tipo: {reference.get('type')}
Identificador: {reference.get('identifier')}
Contexto: {reference.get('context', 'No disponible')}
Analiza:
1. ¿Es un identificador válido?
2. ¿Dónde podría encontrarse este recurso?
3. ¿Es probable que esté disponible en acceso abierto?
4. Proporciona posibles URLs para acceder al recurso.
Devuelve un JSON con el siguiente formato:
{{
"valid": true/false,
"confidence": 0.0-1.0,
"sources": ["lista", "de", "posibles", "fuentes"],
"likely_open_access": true/false,
"suggested_urls": ["url1", "url2"],
"notes": "notas_adicionales"
}}"""
response = await self.generate_text(prompt, max_tokens=1000)
try:
json_match = re.search(r'\{.*\}', response, re.DOTALL)
if json_match:
return json.loads(json_match.group())
except Exception as e:
logger.error(f"Error parsing verification response: {e}")
return {"valid": False, "confidence": 0.0, "sources": [], "notes": "Error en verificación"}
# ========== DATA MODELS ==========
class ResourceType(str, Enum):
DOI = "doi"
ISBN = "isbn"
ARXIV = "arxiv"
URL = "url"
PMID = "pmid"
BIBTEX = "bibtex"
CITATION = "citation"
UNKNOWN = "unknown"
class CitationModel(BaseModel):
id: str
raw_text: str
resource_type: ResourceType
identifier: str
metadata: Dict[str, Any] = Field(default_factory=dict)
confidence: float = 0.0
extracted_from: str
position: Tuple[int, int] = (0, 0)
nebius_verified: bool = False
nebius_confidence: float = 0.0
class VerificationResult(BaseModel):
citation: CitationModel
verified: bool
verification_source: str
download_url: Optional[str]
file_format: Optional[str]
file_size: Optional[int]
quality_score: float
notes: List[str] = Field(default_factory=list)
nebius_analysis: Optional[Dict[str, Any]] = None
class ProcessingReport(BaseModel):
input_file: str
total_citations: int
verified_resources: List[VerificationResult]
downloaded_files: List[str]
failed_verifications: List[CitationModel]
processing_time: float
summary: Dict[str, Any] = Field(default_factory=dict)
timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())
nebius_usage: Dict[str, Any] = Field(default_factory=dict)
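# Example of how a citation is represented (illustrative values only):
#
#     citation = CitationModel(
#         id="a1b2c3d4e5f6",
#         raw_text="doi:10.1000/xyz123",
#         resource_type=ResourceType.DOI,
#         identifier="10.1000/xyz123",
#         confidence=0.8,
#         extracted_from="paper.txt",
#     )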
# ========== TOOLS WITH NEBIUS INTEGRATION ==========
class NebiusEnhancedExtractionTool(Tool):
name = "nebius_extract_references"
description = """
Extract bibliographic references using Nebius AI for enhanced accuracy.
Args:
text (str): Text to analyze
nebius_api_key (str): Nebius API key
use_ai_enhancement (bool): Whether to use Nebius AI for enhancement
Returns:
List[Dict]: Extracted references with Nebius AI analysis
"""
def __init__(self):
super().__init__()
# Basic patterns for initial extraction
self.patterns = {
ResourceType.DOI: [
r'\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b',
r'doi:\s*(10\.\d{4,9}/[-._;()/:A-Z0-9]+)',
],
ResourceType.ISBN: [
r'ISBN(?:-1[03])?:?\s*(?=[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?[0-9]+[- ]?[0-9]+[- ]?[0-9X]',
],
ResourceType.ARXIV: [
r'arXiv:\s*(\d{4}\.\d{4,5}(v\d+)?)',
r'arxiv:\s*([a-z\-]+/\d{7})'
],
}
def forward(self, text: str, nebius_api_key: Optional[str] = None,
use_ai_enhancement: bool = False) -> List[Dict[str, Any]]:
"""Extract references, optionally enhanced with Nebius AI"""
# Basic extraction
basic_references = self._extract_basic(text)
if not use_ai_enhancement or not nebius_api_key:
return basic_references
# Enhancement with Nebius AI
try:
nebius = NebiusAPI(nebius_api_key)
# Run the async call from a synchronous context
import nest_asyncio
nest_asyncio.apply()
# Extract with Nebius
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
nebius_references = loop.run_until_complete(
nebius.extract_references(text[:10000])  # Limit size for the API
)
loop.close()
# Merge results
enhanced_references = self._merge_references(basic_references, nebius_references)
return enhanced_references
except Exception as e:
logger.error(f"Error using Nebius enhancement: {e}")
return basic_references
def _extract_basic(self, text: str) -> List[Dict[str, Any]]:
"""Extracción básica de referencias"""
references = []
for resource_type, patterns in self.patterns.items():
for pattern in patterns:
matches = re.finditer(pattern, text, re.IGNORECASE)
for match in matches:
identifier = match.group(1) if match.groups() else match.group(0)
identifier = self._clean_identifier(identifier, resource_type)
if identifier:
reference = {
"id": hashlib.md5(identifier.encode()).hexdigest()[:12],
"raw_text": match.group(0),
"type": resource_type.value,
"identifier": identifier,
"confidence": 0.8,
"context": self._get_context(text, match.start(), match.end()),
"position": (match.start(), match.end()),
"extraction_method": "regex"
}
references.append(reference)
return references
def _merge_references(self, basic: List[Dict], nebius: List[Dict]) -> List[Dict]:
"""Combinar referencias de extracción básica y Nebius"""
merged = basic.copy()
for nebius_ref in nebius:
# Verificar si ya existe
exists = False
for ref in merged:
if ref.get('identifier') == nebius_ref.get('identifier'):
exists = True
# Actualizar confianza y metadata
ref['confidence'] = max(ref.get('confidence', 0),
nebius_ref.get('confidence', 0))
ref['extraction_method'] = 'regex+nebius'
break
if not exists:
# Convertir formato Nebius a nuestro formato
new_ref = {
"id": hashlib.md5(
nebius_ref.get('identifier', '').encode()
).hexdigest()[:12],
"raw_text": nebius_ref.get('raw_text', ''),
"type": nebius_ref.get('type', 'unknown'),
"identifier": nebius_ref.get('identifier', ''),
"confidence": nebius_ref.get('confidence', 0.7),
"context": nebius_ref.get('context', ''),
"position": (0, 0),
"extraction_method": 'nebius'
}
merged.append(new_ref)
return merged
def _clean_identifier(self, identifier: str, resource_type: ResourceType) -> str:
"""Limpiar identificador"""
identifier = identifier.strip()
prefixes = ['doi:', 'DOI:', 'arxiv:', 'arXiv:', 'isbn:', 'ISBN:', 'pmid:', 'PMID:']
for prefix in prefixes:
if identifier.startswith(prefix):
identifier = identifier[len(prefix):].strip()
identifier = identifier.strip('"\'<>()[]{}')
if resource_type == ResourceType.URL:
if not identifier.startswith(('http://', 'https://')):
identifier = f'https://{identifier}'
return identifier
def _get_context(self, text: str, start: int, end: int, window: int = 100) -> str:
"""Obtener contexto alrededor del match"""
context_start = max(0, start - window)
context_end = min(len(text), end + window)
return text[context_start:context_end]
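# Minimal usage sketch for the extraction tool (regex-only path; the Nebius
# enhancement is skipped when no API key is provided; DOI is a placeholder):
#
#     extractor = NebiusEnhancedExtractionTool()
#     refs = extractor.forward("As shown in doi:10.1000/xyz123 ...", use_ai_enhancement=False)
#     # -> [{"type": "doi", "identifier": "10.1000/xyz123", "confidence": 0.8, ...}]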
class NebiusVerificationTool(Tool):
name = "nebius_verify_reference"
description = """
Verify academic references using Nebius AI analysis.
Args:
reference (Dict): Reference to verify
nebius_api_key (str): Nebius API key
deep_verify (bool): Whether to perform deep verification
Returns:
Dict: Verification results with Nebius analysis
"""
def __init__(self):
super().__init__()
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
def forward(self, reference: Dict[str, Any], nebius_api_key: Optional[str] = None,
deep_verify: bool = False) -> Dict[str, Any]:
"""Verify a reference with Nebius"""
result = {
"reference": reference,
"verified": False,
"verification_source": "direct",
"download_url": None,
"file_format": None,
"file_size": None,
"quality_score": 0.0,
"notes": [],
"nebius_analysis": None
}
# Direct verification first
direct_result = self._direct_verification(reference)
if direct_result.get("verified"):
result.update(direct_result)
result["quality_score"] = 0.9
# Nebius verification when available
if nebius_api_key and deep_verify:
nebius_result = self._nebius_verification(reference, nebius_api_key)
result["nebius_analysis"] = nebius_result
if nebius_result.get("valid", False):
result["verified"] = True
result["verification_source"] = "nebius"
result["quality_score"] = max(
result.get("quality_score", 0),
nebius_result.get("confidence", 0)
)
# Add URLs suggested by Nebius
suggested_urls = nebius_result.get("suggested_urls", [])
if suggested_urls and not result.get("download_url"):
result["download_url"] = suggested_urls[0]
result["notes"].append(
f"Nebius analysis: {nebius_result.get('notes', 'No notes')}"
)
return result
def _direct_verification(self, reference: Dict[str, Any]) -> Dict[str, Any]:
"""Verificación directa de la referencia"""
import requests
ref_type = reference.get("type", "")
identifier = reference.get("identifier", "")
try:
if ref_type == "doi":
return self._verify_doi(identifier)
elif ref_type == "arxiv":
return self._verify_arxiv(identifier)
elif ref_type == "url":
return self._verify_url(identifier)
elif ref_type == "isbn":
return self._verify_isbn(identifier)
except Exception as e:
logger.error(f"Direct verification error: {e}")
return {"verified": False, "notes": [f"Direct verification failed for {ref_type}"]}
def _verify_doi(self, doi: str) -> Dict[str, Any]:
"""Verificar DOI"""
import requests
try:
# Crossref
url = f"https://api.crossref.org/works/{doi}"
response = requests.get(url, headers=self.headers, timeout=10)
if response.status_code == 200:
data = response.json()
work = data.get('message', {})
result = {"verified": True, "notes": ["Verified via Crossref"]}
# Buscar PDF
links = work.get('link', [])
for link in links:
if link.get('content-type') == 'application/pdf':
result["download_url"] = link.get('URL')
result["file_format"] = "pdf"
break
return result
except Exception as e:
logger.error(f"DOI verification error: {e}")
return {"verified": False}
def _verify_arxiv(self, arxiv_id: str) -> Dict[str, Any]:
"""Verificar arXiv ID"""
import requests
try:
# Clean up the ID
if 'arxiv:' in arxiv_id.lower():
arxiv_id = arxiv_id.split(':')[-1].strip()
# Check that the paper exists
api_url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
response = requests.get(api_url, headers=self.headers, timeout=10)
if response.status_code == 200:
return {
"verified": True,
"download_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf",
"file_format": "pdf",
"notes": ["arXiv paper available"]
}
except Exception as e:
logger.error(f"arXiv verification error: {e}")
return {"verified": False}
def _verify_url(self, url: str) -> Dict[str, Any]:
"""Verificar URL"""
import requests
try:
response = requests.head(url, headers=self.headers, timeout=10, allow_redirects=True)
if response.status_code == 200:
result = {"verified": True, "notes": [f"URL accessible: {response.status_code}"]}
# Check whether it is a PDF
content_type = response.headers.get('content-type', '')
if 'application/pdf' in content_type:
result["download_url"] = url
result["file_format"] = "pdf"
return result
except Exception as e:
logger.error(f"URL verification error: {e}")
return {"verified": False}
def _verify_isbn(self, isbn: str) -> Dict[str, Any]:
"""Verificar ISBN"""
import requests
try:
# Open Library
url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&format=json"
response = requests.get(url, headers=self.headers, timeout=10)
if response.status_code == 200:
data = response.json()
if data:
return {
"verified": True,
"notes": ["ISBN found in Open Library"]
}
except Exception as e:
logger.error(f"ISBN verification error: {e}")
return {"verified": False}
def _nebius_verification(self, reference: Dict[str, Any], api_key: str) -> Dict[str, Any]:
"""Verificación con Nebius AI"""
try:
nebius = NebiusAPI(api_key)
# Run the async call from a synchronous context
import nest_asyncio
nest_asyncio.apply()
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
analysis = loop.run_until_complete(
nebius.verify_reference(reference)
)
loop.close()
return analysis
except Exception as e:
logger.error(f"Nebius verification error: {e}")
return {"valid": False, "confidence": 0.0, "notes": f"Error: {str(e)}"}
# ========== MAIN SYSTEM WITH NEBIUS ==========
class NebiusBibliographySystem:
"""Bibliography processing system built on Nebius AI"""
def __init__(self, config: Dict[str, Any]):
self.config = config
self.nebius_api_key = config.get("nebius_api_key")
self.use_nebius = bool(self.nebius_api_key)
# Inicializar herramientas
self.extraction_tool = NebiusEnhancedExtractionTool()
self.verification_tool = NebiusVerificationTool()
# Configurar modelo LiteLLM para agentes
self.llm_model = self._configure_llm()
# Directorios de salida
self.output_base = "nebius_bibliography"
self.download_dir = os.path.join(self.output_base, "downloads")
self.report_dir = os.path.join(self.output_base, "reports")
self.log_dir = os.path.join(self.output_base, "logs")
# Crear directorios
for dir_path in [self.output_base, self.download_dir, self.report_dir, self.log_dir]:
os.makedirs(dir_path, exist_ok=True)
# Estadísticas
self.stats = {
"total_processed": 0,
"total_references": 0,
"nebius_calls": 0,
"success_rate": 0.0
}
logger.info(f"Nebius system initialized. Nebius AI: {'Enabled' if self.use_nebius else 'Disabled'}")
def _configure_llm(self):
"""Configurar modelo LiteLLM"""
provider = self.config.get("llm_provider", "openai")
if provider == "nebius" and self.nebius_api_key:
# Configurar Nebius como proveedor personalizado
return LiteLLMModel(
model_id=self.config.get("llm_model", "neural-chat-7b-v3-1"),
api_key=self.nebius_api_key,
api_base=self.config.get("nebius_api_base", "https://api.studio.nebius.com/v1")
)
elif provider == "openai":
return LiteLLMModel(
model_id=self.config.get("llm_model", "gpt-4"),
api_key=self.config.get("openai_api_key")
)
else:
# Default to Nebius if available
if self.nebius_api_key:
return LiteLLMModel(
model_id="neural-chat-7b-v3-1",
api_key=self.nebius_api_key,
api_base="https://api.studio.nebius.com/v1"
)
else:
return LiteLLMModel(model_id="gpt-4")
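# Example configuration dict accepted by __init__ (the key shown is a placeholder):
#
#     config = {
#         "llm_provider": "nebius",
#         "llm_model": "neural-chat-7b-v3-1",
#         "nebius_api_key": "<your-nebius-key>",
#         "nebius_api_base": "https://api.studio.nebius.com/v1",
#     }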
async def process_document(self, file_path: str, process_id: Optional[str] = None) -> Dict[str, Any]:
"""Process a complete document with Nebius"""
import time
start_time = time.time()
# Generar ID de proceso
process_id = process_id or self._generate_process_id(file_path)
logger.info(f"[{process_id}] Processing document: {file_path}")
try:
# 1. Leer archivo
file_content = self._read_file(file_path)
if not file_content:
return self._error_result(process_id, "Empty or unreadable file")
# 2. Extraer referencias
logger.info(f"[{process_id}] Extracting references...")
references = self.extraction_tool.forward(
text=file_content,
nebius_api_key=self.nebius_api_key,
use_ai_enhancement=self.use_nebius
)
if self.use_nebius:
self.stats["nebius_calls"] += 1
self.stats["total_references"] += len(references)
logger.info(f"[{process_id}] Found {len(references)} references")
# 3. Verificar referencias
logger.info(f"[{process_id}] Verifying references...")
verification_results = []
failed_verifications = []
for i, ref in enumerate(references):
if i % 5 == 0: # Log cada 5 referencias
logger.info(f"[{process_id}] Verified {i}/{len(references)}")
# Verificar referencia
verification = self.verification_tool.forward(
reference=ref,
nebius_api_key=self.nebius_api_key,
deep_verify=self.use_nebius
)
if verification.get("verified"):
# Convertir a modelo
citation = CitationModel(
id=ref.get("id"),
raw_text=ref.get("raw_text", ""),
resource_type=ResourceType(ref.get("type", "unknown")),
identifier=ref.get("identifier", ""),
confidence=ref.get("confidence", 0.0),
extracted_from=file_path,
position=ref.get("position", (0, 0)),
nebius_verified=self.use_nebius,
nebius_confidence=verification.get("quality_score", 0.0)
)
vr = VerificationResult(
citation=citation,
verified=True,
verification_source=verification.get("verification_source", "unknown"),
download_url=verification.get("download_url"),
file_format=verification.get("file_format"),
file_size=verification.get("file_size"),
quality_score=verification.get("quality_score", 0.0),
notes=verification.get("notes", []),
nebius_analysis=verification.get("nebius_analysis")
)
verification_results.append(vr)
else:
# Referencia fallida
citation = CitationModel(
id=ref.get("id"),
raw_text=ref.get("raw_text", ""),
resource_type=ResourceType(ref.get("type", "unknown")),
identifier=ref.get("identifier", ""),
confidence=ref.get("confidence", 0.0),
extracted_from=file_path,
position=ref.get("position", (0, 0)),
nebius_verified=False,
nebius_confidence=0.0
)
failed_verifications.append(citation)
# 4. Descargar archivos verificados
logger.info(f"[{process_id}] Downloading files...")
downloaded_files = await self._download_files(
verification_results,
process_id
)
# 5. Generar reporte
processing_time = time.time() - start_time
report = ProcessingReport(
input_file=file_path,
total_citations=len(references),
verified_resources=verification_results,
downloaded_files=downloaded_files,
failed_verifications=failed_verifications,
processing_time=processing_time,
summary={
"success_rate": len(verification_results) / max(1, len(references)),
"download_rate": len(downloaded_files) / max(1, len(verification_results)),
"avg_quality": sum(vr.quality_score for vr in verification_results) / max(1, len(verification_results))
},
nebius_usage={
"enabled": self.use_nebius,
"calls": self.stats["nebius_calls"],
"enhanced_references": sum(1 for vr in verification_results if vr.nebius_analysis)
}
)
# 6. Guardar resultados
self._save_results(report, process_id)
self.stats["total_processed"] += 1
self.stats["success_rate"] = report.summary.get("success_rate", 0.0)
logger.info(f"[{process_id}] Processing completed in {processing_time:.2f}s")
return {
"success": True,
"process_id": process_id,
"report": report.dict(),
"zip_path": self._create_zip(report, process_id),
"summary": {
"references_found": len(references),
"verified": len(verification_results),
"downloaded": len(downloaded_files),
"success_rate": f"{report.summary.get('success_rate', 0) * 100:.1f}%",
"processing_time": f"{processing_time:.2f}s"
}
}
except Exception as e:
logger.error(f"[{process_id}] Processing error: {e}")
return self._error_result(process_id, str(e))
def _read_file(self, file_path: str) -> str:
"""Leer contenido del archivo"""
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
return f.read()
except Exception as e:
logger.error(f"Error reading file {file_path}: {e}")
return ""
async def _download_files(self, verification_results: List[VerificationResult],
process_id: str) -> List[str]:
"""Descargar archivos de URLs verificadas"""
downloaded_files = []
for i, vr in enumerate(verification_results):
if vr.download_url:
try:
file_path = await self._download_file(
vr.download_url,
vr.citation.identifier,
process_id,
i
)
if file_path:
downloaded_files.append(file_path)
except Exception as e:
logger.error(f"Download failed for {vr.citation.identifier}: {e}")
return downloaded_files
async def _download_file(self, url: str, identifier: str,
process_id: str, index: int) -> Optional[str]:
"""Descargar un archivo individual"""
import aiohttp
try:
# Crear nombre de archivo seguro
safe_name = re.sub(r'[^\w\-\.]', '_', identifier)
if len(safe_name) > 100:
safe_name = safe_name[:100]
# Determinar extensión
extension = self._get_extension_from_url(url)
if not extension:
extension = ".pdf" # Default
filename = f"{process_id}_{index:03d}_{safe_name}{extension}"
filepath = os.path.join(self.download_dir, filename)
# Descargar
timeout = aiohttp.ClientTimeout(total=60)
async with aiohttp.ClientSession(timeout=timeout) as session:
async with session.get(url, headers={'User-Agent': 'Mozilla/5.0'}) as response:
if response.status == 200:
content = await response.read()
# Verificar que sea un archivo válido
if len(content) > 100: # Archivo no vacío
with open(filepath, 'wb') as f:
f.write(content)
logger.info(f"Downloaded: {filename} ({len(content)} bytes)")
return filepath
return None
except Exception as e:
logger.error(f"Download error for {url}: {e}")
return None
def _get_extension_from_url(self, url: str) -> str:
"""Obtener extensión de archivo desde URL"""
url_lower = url.lower()
if '.pdf' in url_lower:
return '.pdf'
elif '.docx' in url_lower or '.doc' in url_lower:
return '.docx'
elif '.html' in url_lower or '.htm' in url_lower:
return '.html'
elif '.txt' in url_lower:
return '.txt'
elif '.epub' in url_lower:
return '.epub'
return ""
def _generate_process_id(self, file_path: str) -> str:
"""Generar ID único de proceso"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
file_hash = hashlib.md5(file_path.encode()).hexdigest()[:6]
return f"NB_{timestamp}_{file_hash}"
def _save_results(self, report: ProcessingReport, process_id: str):
"""Guardar resultados en disco"""
# Guardar reporte JSON
report_path = os.path.join(self.report_dir, f"{process_id}_report.json")
with open(report_path, 'w', encoding='utf-8') as f:
json.dump(report.dict(), f, indent=2, default=str)
# Guardar resumen en texto
summary_path = os.path.join(self.report_dir, f"{process_id}_summary.txt")
with open(summary_path, 'w', encoding='utf-8') as f:
f.write(self._generate_text_summary(report))
def _create_zip(self, report: ProcessingReport, process_id: str) -> str:
"""Crear archivo ZIP con resultados"""
import zipfile
zip_path = os.path.join(self.output_base, f"{process_id}_results.zip")
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
# Agregar reportes
report_files = [
f for f in os.listdir(self.report_dir)
if f.startswith(process_id)
]
for file in report_files:
filepath = os.path.join(self.report_dir, file)
zipf.write(filepath, f"reports/{file}")
# Agregar archivos descargados
for file_path in report.downloaded_files:
if os.path.exists(file_path):
filename = os.path.basename(file_path)
zipf.write(file_path, f"downloads/{filename}")
# Agregar log
log_path = os.path.join(self.log_dir, f"{process_id}_log.txt")
with open(log_path, 'w') as f:
f.write(f"Process ID: {process_id}\n")
f.write(f"Time: {datetime.now().isoformat()}\n")
f.write(f"Success rate: {report.summary.get('success_rate', 0) * 100:.1f}%\n")
zipf.write(log_path, "process_log.txt")
return zip_path
def _generate_text_summary(self, report: ProcessingReport) -> str:
"""Generar resumen en texto"""
summary = f"""
NEBIUS BIBLIOGRAPHY PROCESSING REPORT
=====================================
Process ID: Generated automatically
Input File: {report.input_file}
Processing Time: {report.processing_time:.2f} seconds
Timestamp: {report.timestamp}
SUMMARY STATISTICS
------------------
Total References Found: {report.total_citations}
Successfully Verified: {len(report.verified_resources)}
Files Downloaded: {len(report.downloaded_files)}
Verification Success Rate: {report.summary.get('success_rate', 0) * 100:.1f}%
Average Quality Score: {report.summary.get('avg_quality', 0):.2f}
NEBIUS AI USAGE
---------------
Enabled: {report.nebius_usage.get('enabled', False)}
API Calls: {report.nebius_usage.get('calls', 0)}
Enhanced References: {report.nebius_usage.get('enhanced_references', 0)}
VERIFIED RESOURCES (Top 10)
---------------------------
"""
for i, vr in enumerate(report.verified_resources[:10], 1):
summary += f"\n{i}. {vr.citation.identifier}"
summary += f"\n Type: {vr.citation.resource_type.value}"
summary += f"\n Source: {vr.verification_source}"
summary += f"\n Quality: {vr.quality_score:.2f}"
summary += f"\n Nebius Enhanced: {vr.citation.nebius_verified}"
if vr.download_url:
summary += f"\n Downloaded: Yes"
summary += "\n"
if report.failed_verifications:
summary += f"\nFAILED VERIFICATIONS ({len(report.failed_verifications)})\n"
summary += "-" * 40 + "\n"
for citation in report.failed_verifications[:5]:
summary += f"- {citation.identifier} ({citation.resource_type.value})\n"
summary += f"\nFILES DOWNLOADED\n"
summary += "-" * 40 + "\n"
for file_path in report.downloaded_files:
if os.path.exists(file_path):
file_size = os.path.getsize(file_path)
summary += f"- {os.path.basename(file_path)} ({file_size} bytes)\n"
return summary
def _error_result(self, process_id: str, error: str) -> Dict[str, Any]:
"""Generar resultado de error"""
return {
"success": False,
"process_id": process_id,
"error": error,
"timestamp": datetime.now().isoformat()
}
def get_stats(self) -> Dict[str, Any]:
"""Obtener estadísticas del sistema"""
return {
"total_processed": self.stats["total_processed"],
"total_references": self.stats["total_references"],
"nebius_calls": self.stats["nebius_calls"],
"success_rate": self.stats["success_rate"],
"output_directory": self.output_base
}
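# End-to-end usage sketch outside the Gradio UI (assumes a local "refs.txt";
# without a Nebius key only the regex/direct-verification path runs):
#
#     system = NebiusBibliographySystem({"nebius_api_key": None})
#     result = asyncio.run(system.process_document("refs.txt"))
#     print(result["summary"])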
# ========== ENHANCED GRADIO INTERFACE ==========
def create_nebius_interface():
"""Create the Gradio interface with Nebius support"""
system = None
current_process = None
def initialize_system(provider, model, nebius_key, nebius_base, openai_key):
"""Inicializar sistema con configuración"""
nonlocal system
config = {
"llm_provider": provider,
"llm_model": model,
"nebius_api_key": nebius_key,
"nebius_api_base": nebius_base or "https://api.studio.nebius.com/v1",
"openai_api_key": openai_key,
"use_nebius": bool(nebius_key)
}
try:
system = NebiusBibliographySystem(config)
return "✅ Sistema inicializado con Nebius AI" if nebius_key else "✅ Sistema inicializado (sin Nebius)"
except Exception as e:
return f"❌ Error: {str(e)}"
async def process_document(file_obj, use_nebius, progress=gr.Progress()):
"""Procesar documento"""
nonlocal system, current_process
if not system:
return None, "❌ Sistema no inicializado", "", "", ""
try:
progress(0, desc="Preparando archivo...")
# Guardar archivo temporalmente
import tempfile
import shutil
temp_dir = tempfile.mkdtemp()
file_path = os.path.join(temp_dir, os.path.basename(file_obj.name))
shutil.copy(file_obj.name, file_path)
progress(0.1, desc="Procesando con Nebius..." if use_nebius else "Procesando...")
# Procesar documento
result = await system.process_document(file_path)
if not result.get("success"):
# Clean up the temporary directory
shutil.rmtree(temp_dir, ignore_errors=True)
return None, f"❌ Error: {result.get('error')}", "", "", "", ""
current_process = result.get("process_id")
summary = result.get("summary", {})
progress(0.9, desc="Generando reportes...")
# Generate the report views
report_data = result.get("report", {})
# HTML output
html_output = _generate_html_report(report_data)
# Text output
text_output = _generate_text_report(report_data)
# JSON output
json_output = json.dumps(report_data, indent=2, default=str)
# Statistics
stats_output = _generate_stats_display(summary)
progress(1.0, desc="Completado!")
# Clean up the temporary directory
shutil.rmtree(temp_dir, ignore_errors=True)
return (
result.get("zip_path"),
f"✅ Proceso {current_process} completado",
html_output,
text_output,
json_output,
stats_output
)
except Exception as e:
logger.error(f"Processing error: {e}")
return None, f"❌ Error: {str(e)}", "", "", "", ""
def _generate_html_report(report_data: Dict) -> str:
"""Generate the HTML report"""
verified = len(report_data.get("verified_resources", []))
total = report_data.get("total_citations", 0)
success_rate = (verified / max(1, total)) * 100
nebius_usage = report_data.get("nebius_usage", {})
html = f"""
<div style="font-family: Arial, sans-serif; padding: 20px;">
<h2 style="color: #2c3e50;">📊 Reporte de Procesamiento Nebius</h2>
<div style="background: #ecf0f1; padding: 15px; border-radius: 10px; margin: 15px 0;">
<h3 style="color: #34495e;">📈 Resumen General</h3>
<div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 10px;">
<div style="background: white; padding: 10px; border-radius: 5px;">
<strong>Referencias Encontradas</strong><br>
<span style="font-size: 24px; color: #3498db;">{total}</span>
</div>
<div style="background: white; padding: 10px; border-radius: 5px;">
<strong>Verificadas</strong><br>
<span style="font-size: 24px; color: #2ecc71;">{verified}</span>
</div>
<div style="background: white; padding: 10px; border-radius: 5px;">
<strong>Tasa de Éxito</strong><br>
<span style="font-size: 24px; color: #9b59b6;">{success_rate:.1f}%</span>
</div>
<div style="background: white; padding: 10px; border-radius: 5px;">
<strong>Tiempo</strong><br>
<span style="font-size: 24px; color: #e74c3c;">{report_data.get('processing_time', 0):.1f}s</span>
</div>
</div>
</div>
<div style="background: #d5f4e6; padding: 15px; border-radius: 10px; margin: 15px 0;">
<h3 style="color: #27ae60;">🤖 Nebius AI</h3>
<p><strong>Estado:</strong> {'✅ Activado' if nebius_usage.get('enabled') else '❌ Desactivado'}</p>
<p><strong>Llamadas API:</strong> {nebius_usage.get('calls', 0)}</p>
<p><strong>Referencias Mejoradas:</strong> {nebius_usage.get('enhanced_references', 0)}</p>
</div>
<div style="background: #e8f4fc; padding: 15px; border-radius: 10px; margin: 15px 0;">
<h3 style="color: #2980b9;">📥 Descargas</h3>
<p><strong>Archivos Descargados:</strong> {len(report_data.get('downloaded_files', []))}</p>
<ul>
"""
for file in report_data.get("downloaded_files", [])[:5]:
filename = os.path.basename(file)
html += f'<li>{filename}</li>'
html += """
</ul>
</div>
<div style="background: #fdebd0; padding: 15px; border-radius: 10px; margin: 15px 0;">
<h3 style="color: #d35400;">⚠️ Referencias No Verificadas</h3>
<p><strong>Total:</strong> {failed}</p>
""".format(failed=len(report_data.get("failed_verifications", [])))
html += """
</div>
</div>
"""
return html
def _generate_text_report(report_data: Dict) -> str:
"""Generate the plain-text report"""
verified = len(report_data.get("verified_resources", []))
total = report_data.get("total_citations", 0)
text = f"""
REPORTE DE PROCESAMIENTO
========================
Archivo: {report_data.get('input_file', 'Desconocido')}
Fecha: {report_data.get('timestamp', '')}
ESTADÍSTICAS:
-------------
• Referencias encontradas: {total}
• Referencias verificadas: {verified}
• Archivos descargados: {len(report_data.get('downloaded_files', []))}
• Tiempo de procesamiento: {report_data.get('processing_time', 0):.2f}s
• Tasa de éxito: {(verified/max(1, total))*100:.1f}%
NEBIUS AI:
----------
• Estado: {'Activado' if report_data.get('nebius_usage', {}).get('enabled') else 'Desactivado'}
• Llamadas API: {report_data.get('nebius_usage', {}).get('calls', 0)}
• Referencias mejoradas: {report_data.get('nebius_usage', {}).get('enhanced_references', 0)}
Para más detalles, consulte el archivo ZIP con el reporte completo.
"""
return text
def _generate_stats_display(summary: Dict) -> str:
"""Generate the quick-stats display"""
return f"""
⚡ PROCESO COMPLETADO ⚡
📊 Estadísticas Rápidas:
• Referencias: {summary.get('references_found', 0)}
• Verificadas: {summary.get('verified', 0)}
• Descargadas: {summary.get('downloaded', 0)}
• Tasa de éxito: {summary.get('success_rate', '0%')}
• Tiempo: {summary.get('processing_time', '0s')}
"""
def get_system_stats():
"""Obtener estadísticas del sistema"""
nonlocal system
if not system:
return "❌ Sistema no inicializado"
stats = system.get_stats()
return f"""
📈 Estadísticas del Sistema Nebius:
• Documentos procesados: {stats.get('total_processed', 0)}
• Referencias totales: {stats.get('total_references', 0)}
• Llamadas Nebius API: {stats.get('nebius_calls', 0)}
• Tasa de éxito promedio: {stats.get('success_rate', 0) * 100:.1f}%
• Directorio de salida: {stats.get('output_directory', 'N/A')}
"""
# Crear interfaz
with gr.Blocks(title="Nebius Bibliography System", theme=gr.themes.Soft()) as interface:
gr.Markdown("# 📚 Sistema de Recopilación Bibliográfica con Nebius AI")
gr.Markdown("Procesa documentos académicos usando Nebius AI para extracción y verificación inteligente")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### ⚙️ Configuración Nebius AI")
provider = gr.Dropdown(
choices=["nebius", "openai"],
label="Proveedor de IA Principal",
value="nebius",
info="Selecciona Nebius para usar la API de Nebius AI"
)
model = gr.Textbox(
label="Modelo",
value="neural-chat-7b-v3-1",
placeholder="Modelo de Nebius (ej: neural-chat-7b-v3-1)"
)
nebius_key = gr.Textbox(
label="Nebius API Key",
type="password",
placeholder="Ingresa tu API Key de Nebius"
)
nebius_base = gr.Textbox(
label="Nebius API Base (opcional)",
value="https://api.studio.nebius.com/v1",
placeholder="URL base de la API de Nebius"
)
openai_key = gr.Textbox(
label="OpenAI API Key (respaldo)",
type="password",
placeholder="Opcional: Key de OpenAI como respaldo"
)
init_btn = gr.Button("🚀 Inicializar Sistema Nebius", variant="primary")
init_status = gr.Markdown("")
gr.Markdown("---")
stats_btn = gr.Button("📊 Estadísticas del Sistema")
system_stats = gr.Markdown("")
with gr.Column(scale=2):
gr.Markdown("### 📄 Procesar Documento")
file_input = gr.File(
label="Sube tu documento",
file_types=[".txt", ".pdf", ".docx", ".html", ".md"]
)
use_nebius = gr.Checkbox(
label="Usar Nebius AI para mejora de precisión",
value=True
)
process_btn = gr.Button("🔍 Procesar con Nebius AI", variant="primary")
gr.Markdown("### 📦 Resultados")
result_file = gr.File(label="Descargar Paquete Completo (ZIP)")
result_status = gr.Markdown("")
stats_display = gr.Markdown("")
with gr.Tabs():
with gr.TabItem("📋 Vista HTML"):
html_output = gr.HTML(label="Reporte Interactivo")
with gr.TabItem("📝 Texto Plano"):
text_output = gr.Textbox(
label="Resumen",
lines=15,
max_lines=30
)
with gr.TabItem("🔧 JSON Completo"):
json_output = gr.Code(
label="Datos Completos",
language="json",
lines=20
)
# Conectar eventos
init_btn.click(
initialize_system,
inputs=[provider, model, nebius_key, nebius_base, openai_key],
outputs=init_status
)
process_btn.click(
process_document,
inputs=[file_input, use_nebius],
outputs=[result_file, result_status, html_output, text_output, json_output, stats_display]
)
stats_btn.click(
get_system_stats,
outputs=system_stats
)
# Información
gr.Markdown("""
### 📌 Características Nebius AI
**🔍 Extracción Inteligente:**
- Identificación contextual de referencias
- Corrección automática de identificadores
- Clasificación por tipo de recurso
**✅ Verificación Avanzada:**
- Análisis de accesibilidad
- Detección de acceso abierto
- Sugerencias de fuentes alternativas
**📊 Reportes Mejorados:**
- Métricas de confianza Nebius
- Análisis de calidad por referencia
- Estadísticas de uso de IA
### ⚠️ Notas Importantes
1. La API de Nebius requiere una key válida
2. Los archivos grandes pueden consumir más tokens
3. Se recomienda usar Nebius para máxima precisión
4. Mantén tu API key segura y no la compartas
### 🔗 Recursos
• [Documentación Nebius AI](https://docs.nebius.com)
• [Obtener API Key](https://studio.nebius.com)
• [Soporte Técnico](https://support.nebius.com)
""")
return interface
# ========== MAIN ENTRY POINT ==========
async def main():
"""Main entry point"""
import argparse
parser = argparse.ArgumentParser(description="Sistema Nebius de Recopilación Bibliográfica")
parser.add_argument("--mode", choices=["gui", "cli"], default="gui",
help="Modo de ejecución")
parser.add_argument("--file", type=str, help="Archivo a procesar (modo CLI)")
parser.add_argument("--nebius-key", help="API Key de Nebius")
parser.add_argument("--model", default="neural-chat-7b-v3-1", help="Modelo Nebius")
parser.add_argument("--api-base", default="https://api.studio.nebius.com/v1",
help="URL base de Nebius API")
args = parser.parse_args()
if args.mode == "gui":
# Ejecutar interfaz Gradio
interface = create_nebius_interface()
interface.launch(
server_name="0.0.0.0",
server_port=7860,
share=True,
debug=True
)
elif args.mode == "cli":
# Modo línea de comandos
if not args.file:
print("❌ Error: Debes especificar un archivo con --file")
return
if not os.path.exists(args.file):
print(f"❌ Error: Archivo no encontrado: {args.file}")
return
if not args.nebius_key:
print("⚠️ Advertencia: No se proporcionó API Key de Nebius")
use_nebius = False
nebius_key = None
else:
use_nebius = True
nebius_key = args.nebius_key
# Configurar sistema
config = {
"llm_provider": "nebius" if use_nebius else "openai",
"llm_model": args.model,
"nebius_api_key": nebius_key,
"nebius_api_base": args.api_base,
"use_nebius": use_nebius
}
system = NebiusBibliographySystem(config)
print(f"🔍 Procesando archivo: {args.file}")
print(f"🤖 Nebius AI: {'Activado' if use_nebius else 'Desactivado'}")
print("⏳ Procesando...")
result = await system.process_document(args.file)
if result.get("success"):
print(f"✅ Procesamiento completado!")
print(f"📊 ID del proceso: {result.get('process_id')}")
summary = result.get("summary", {})
print(f"""
📈 Resultados:
- Referencias encontradas: {summary.get('references_found', 0)}
- Referencias verificadas: {summary.get('verified', 0)}
- Archivos descargados: {summary.get('downloaded', 0)}
- Tasa de éxito: {summary.get('success_rate', '0%')}
- Tiempo de procesamiento: {summary.get('processing_time', '0s')}
📦 Paquete de resultados: {result.get('zip_path')}
📊 Estadísticas Nebius:
- Llamadas API: {result.get('report', {}).get('nebius_usage', {}).get('calls', 0)}
- Referencias mejoradas: {result.get('report', {}).get('nebius_usage', {}).get('enhanced_references', 0)}
""")
else:
print(f"❌ Error: {result.get('error')}")
if __name__ == "__main__":
asyncio.run(main())
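# Example invocations (illustrative; file names and keys are placeholders):
#   python app.py                                              # launch the Gradio UI on port 7860
#   python app.py --mode cli --file refs.txt --nebius-key <your-key>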