C2MV committed
Commit 38e5acc · verified · 1 Parent(s): 127ed20

Update app.py

Files changed (1)
  1. app.py +586 -1208
app.py CHANGED
@@ -4,66 +4,108 @@ import json
4
  import logging
5
  import zipfile
6
  import asyncio
7
- import tempfile
8
- from typing import Dict, List, Optional, Any, Tuple
9
- from dataclasses import dataclass, field
10
- from pathlib import Path
11
  from datetime import datetime
12
  import gradio as gr
13
  from enum import Enum
14
  import hashlib
15
- import urllib.parse
16
  import aiohttp
17
 
18
- # Importar smolagents
19
- from smolagents import CodeAgent, ToolCallingAgent, LiteLLMModel
20
- from smolagents.tools import Tool, tool
21
- from pydantic import BaseModel, Field
22
-
23
  # Configuración de logging
24
  logging.basicConfig(
25
  level=logging.INFO,
26
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
27
- handlers=[
28
- logging.FileHandler('bibliography_nebius.log'),
29
- logging.StreamHandler()
30
- ]
31
  )
32
  logger = logging.getLogger(__name__)
33
 
34
- # ========== CONFIGURACIÓN NEBIUS API ==========
35
 
36
- class NebiusAPI:
37
- """Cliente para API de Nebius AI"""
38
 
39
- def __init__(self, api_key: str, base_url: str = "https://api.studio.nebius.com"):
40
- self.api_key = api_key
41
- self.base_url = base_url
42
- self.headers = {
43
- "Authorization": f"Bearer {api_key}",
44
- "Content-Type": "application/json"
 
 
45
  }
46
-
47
- async def generate_text(self, prompt: str, model: str = "neural-chat-7b-v3-1",
48
- max_tokens: int = 1000, temperature: float = 0.7) -> str:
49
- """Generar texto usando modelos de Nebius"""
50
- url = f"{self.base_url}/v1/chat/completions"
51
 
52
- payload = {
53
- "model": model,
54
- "messages": [
55
- {"role": "user", "content": prompt}
56
- ],
57
- "max_tokens": max_tokens,
58
- "temperature": temperature,
59
- "top_p": 0.95
60
  }
 
 
61
 
62
  try:
 
 
63
  async with aiohttp.ClientSession() as session:
64
  async with session.post(
65
- url,
66
- headers=self.headers,
67
  json=payload,
68
  timeout=30
69
  ) as response:
@@ -72,370 +114,145 @@ class NebiusAPI:
72
  return data.get("choices", [{}])[0].get("message", {}).get("content", "")
73
  else:
74
  error_text = await response.text()
75
- logger.error(f"Nebius API error {response.status}: {error_text}")
76
- return ""
77
- except Exception as e:
78
- logger.error(f"Error calling Nebius API: {e}")
79
- return ""
80
-
81
- async def extract_references(self, text: str) -> List[Dict[str, Any]]:
82
- """Usar Nebius para extraer referencias de texto"""
83
- prompt = f"""Analiza el siguiente texto y extrae todas las referencias bibliográficas.
84
- Identifica DOIs, ISBNs, URLs académicas, arXiv IDs y otras referencias académicas.
85
-
86
- Texto:
87
- {text[:5000]} # Limitar tamaño
88
-
89
- Devuelve un JSON con el siguiente formato:
90
- {{
91
- "references": [
92
- {{
93
- "type": "doi|isbn|arxiv|url|pmid|other",
94
- "identifier": "identificador_completo",
95
- "raw_text": "texto_original_encontrado",
96
- "confidence": 0.0-1.0,
97
- "context": "texto_alrededor_del_identificador"
98
- }}
99
- ]
100
- }}
101
-
102
- Solo devuelve el JSON, sin texto adicional."""
103
-
104
- response = await self.generate_text(prompt, max_tokens=2000)
105
-
106
- try:
107
- # Buscar JSON en la respuesta
108
- json_match = re.search(r'\{.*\}', response, re.DOTALL)
109
- if json_match:
110
- data = json.loads(json_match.group())
111
- return data.get("references", [])
112
  except Exception as e:
113
- logger.error(f"Error parsing Nebius response: {e}")
114
-
115
- return []
116
 
117
- async def verify_reference(self, reference: Dict[str, Any]) -> Dict[str, Any]:
118
- """Verificar una referencia usando Nebius"""
119
- prompt = f"""Verifica la siguiente referencia académica y proporciona información sobre su accesibilidad:
120
-
121
- Tipo: {reference.get('type')}
122
- Identificador: {reference.get('identifier')}
123
- Contexto: {reference.get('context', 'No disponible')}
124
-
125
- Analiza:
126
- 1. ¿Es un identificador válido?
127
- 2. ¿Dónde podría encontrarse este recurso?
128
- 3. ¿Es probable que esté disponible en acceso abierto?
129
- 4. Proporciona posibles URLs para acceder al recurso.
130
-
131
- Devuelve un JSON con el siguiente formato:
132
- {{
133
- "valid": true/false,
134
- "confidence": 0.0-1.0,
135
- "sources": ["lista", "de", "posibles", "fuentes"],
136
- "likely_open_access": true/false,
137
- "suggested_urls": ["url1", "url2"],
138
- "notes": "notas_adicionales"
139
- }}"""
140
-
141
- response = await self.generate_text(prompt, max_tokens=1000)
142
-
143
  try:
144
- json_match = re.search(r'\{.*\}', response, re.DOTALL)
145
- if json_match:
146
- return json.loads(json_match.group())
 
 
147
  except Exception as e:
148
- logger.error(f"Error parsing verification response: {e}")
149
-
150
- return {"valid": False, "confidence": 0.0, "sources": [], "notes": "Error en verificación"}
151
-
152
- # ========== MODELOS DE DATOS ==========
153
-
154
- class ResourceType(str, Enum):
155
- DOI = "doi"
156
- ISBN = "isbn"
157
- ARXIV = "arxiv"
158
- URL = "url"
159
- PMID = "pmid"
160
- BIBTEX = "bibtex"
161
- CITATION = "citation"
162
- UNKNOWN = "unknown"
163
-
164
- class CitationModel(BaseModel):
165
- id: str
166
- raw_text: str
167
- resource_type: ResourceType
168
- identifier: str
169
- metadata: Dict[str, Any] = Field(default_factory=dict)
170
- confidence: float = 0.0
171
- extracted_from: str
172
- position: Tuple[int, int] = (0, 0)
173
- nebius_verified: bool = False
174
- nebius_confidence: float = 0.0
175
-
176
- class VerificationResult(BaseModel):
177
- citation: CitationModel
178
- verified: bool
179
- verification_source: str
180
- download_url: Optional[str]
181
- file_format: Optional[str]
182
- file_size: Optional[int]
183
- quality_score: float
184
- notes: List[str] = Field(default_factory=list)
185
- nebius_analysis: Optional[Dict[str, Any]] = None
186
-
187
- class ProcessingReport(BaseModel):
188
- input_file: str
189
- total_citations: int
190
- verified_resources: List[VerificationResult]
191
- downloaded_files: List[str]
192
- failed_verifications: List[CitationModel]
193
- processing_time: float
194
- summary: Dict[str, Any] = Field(default_factory=dict)
195
- timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())
196
- nebius_usage: Dict[str, Any] = Field(default_factory=dict)
197
 
198
- # ========== HERRAMIENTAS CON INTEGRACIÓN NEBIUS ==========
199
 
200
- class NebiusEnhancedExtractionTool(Tool):
201
- name = "nebius_extract_references"
202
- description = """
203
- Extract bibliographic references using Nebius AI for enhanced accuracy.
204
-
205
- Args:
206
- text (str): Text to analyze
207
- nebius_api_key (str): Nebius API key
208
- use_ai_enhancement (bool): Whether to use Nebius AI for enhancement
209
-
210
- Returns:
211
- List[Dict]: Extracted references with Nebius AI analysis
212
- """
213
 
214
  def __init__(self):
215
- super().__init__()
216
- # Patrones básicos para extracción inicial
217
  self.patterns = {
218
- ResourceType.DOI: [
219
  r'\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b',
220
  r'doi:\s*(10\.\d{4,9}/[-._;()/:A-Z0-9]+)',
 
221
  ],
222
- ResourceType.ISBN: [
223
- r'ISBN(?:-1[03])?:?\s*(?=[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?[0-9]+[- ]?[0-9]+[- ]?[0-9X]',
224
- ],
225
- ResourceType.ARXIV: [
226
  r'arXiv:\s*(\d{4}\.\d{4,5}(v\d+)?)',
227
- r'arxiv:\s*([a-z\-]+/\d{7})'
 
 
228
  ],
 
 
229
  }
230
 
231
- def forward(self, text: str, nebius_api_key: str = None,
232
- use_ai_enhancement: bool = False) -> List[Dict[str, Any]]:
233
- """Extraer referencias con opción de mejora con Nebius"""
234
- # Extracción básica
235
- basic_references = self._extract_basic(text)
236
-
237
- if not use_ai_enhancement or not nebius_api_key:
238
- return basic_references
239
-
240
- # Mejora con Nebius AI
241
- try:
242
- nebius = NebiusAPI(nebius_api_key)
243
-
244
- # Usar asyncio en contexto síncrono
245
- import nest_asyncio
246
- nest_asyncio.apply()
247
-
248
- # Extraer con Nebius
249
- loop = asyncio.new_event_loop()
250
- asyncio.set_event_loop(loop)
251
- nebius_references = loop.run_until_complete(
252
- nebius.extract_references(text[:10000]) # Limitar para API
253
- )
254
- loop.close()
255
-
256
- # Combinar resultados
257
- enhanced_references = self._merge_references(basic_references, nebius_references)
258
- return enhanced_references
259
-
260
- except Exception as e:
261
- logger.error(f"Error using Nebius enhancement: {e}")
262
- return basic_references
263
-
264
- def _extract_basic(self, text: str) -> List[Dict[str, Any]]:
265
- """Extracción básica de referencias"""
266
- references = []
267
 
268
- for resource_type, patterns in self.patterns.items():
 
269
  for pattern in patterns:
270
- matches = re.finditer(pattern, text, re.IGNORECASE)
271
- for match in matches:
272
- identifier = match.group(1) if match.groups() else match.group(0)
273
- identifier = self._clean_identifier(identifier, resource_type)
274
-
275
- if identifier:
276
- reference = {
277
- "id": hashlib.md5(identifier.encode()).hexdigest()[:12],
278
- "raw_text": match.group(0),
279
- "type": resource_type.value,
280
- "identifier": identifier,
281
- "confidence": 0.8,
282
- "context": self._get_context(text, match.start(), match.end()),
283
- "position": (match.start(), match.end()),
284
- "extraction_method": "regex"
285
- }
286
- references.append(reference)
287
-
288
- return references
289
-
290
- def _merge_references(self, basic: List[Dict], nebius: List[Dict]) -> List[Dict]:
291
- """Combinar referencias de extracción básica y Nebius"""
292
- merged = basic.copy()
293
-
294
- for nebius_ref in nebius:
295
- # Verificar si ya existe
296
- exists = False
297
- for ref in merged:
298
- if ref.get('identifier') == nebius_ref.get('identifier'):
299
- exists = True
300
- # Actualizar confianza y metadata
301
- ref['confidence'] = max(ref.get('confidence', 0),
302
- nebius_ref.get('confidence', 0))
303
- ref['extraction_method'] = 'regex+nebius'
304
- break
305
-
306
- if not exists:
307
- # Convertir formato Nebius a nuestro formato
308
- new_ref = {
309
- "id": hashlib.md5(
310
- nebius_ref.get('identifier', '').encode()
311
- ).hexdigest()[:12],
312
- "raw_text": nebius_ref.get('raw_text', ''),
313
- "type": nebius_ref.get('type', 'unknown'),
314
- "identifier": nebius_ref.get('identifier', ''),
315
- "confidence": nebius_ref.get('confidence', 0.7),
316
- "context": nebius_ref.get('context', ''),
317
- "position": (0, 0),
318
- "extraction_method": 'nebius'
319
- }
320
- merged.append(new_ref)
321
-
322
- return merged
323
-
324
- def _clean_identifier(self, identifier: str, resource_type: ResourceType) -> str:
325
- """Limpiar identificador"""
326
  identifier = identifier.strip()
327
 
 
328
  prefixes = ['doi:', 'DOI:', 'arxiv:', 'arXiv:', 'isbn:', 'ISBN:', 'pmid:', 'PMID:']
329
  for prefix in prefixes:
330
  if identifier.startswith(prefix):
331
  identifier = identifier[len(prefix):].strip()
332
 
 
333
  identifier = identifier.strip('"\'<>()[]{}')
334
 
335
- if resource_type == ResourceType.URL:
336
- if not identifier.startswith(('http://', 'https://')):
337
- identifier = f'https://{identifier}'
338
 
339
  return identifier
340
-
341
- def _get_context(self, text: str, start: int, end: int, window: int = 100) -> str:
342
- """Obtener contexto alrededor del match"""
343
- context_start = max(0, start - window)
344
- context_end = min(len(text), end + window)
345
- return text[context_start:context_end]
346
 
347
- class NebiusVerificationTool(Tool):
348
- name = "nebius_verify_reference"
349
- description = """
350
- Verify academic references using Nebius AI analysis.
351
-
352
- Args:
353
- reference (Dict): Reference to verify
354
- nebius_api_key (str): Nebius API key
355
- deep_verify (bool): Whether to perform deep verification
356
-
357
- Returns:
358
- Dict: Verification results with Nebius analysis
359
- """
360
 
361
  def __init__(self):
362
- super().__init__()
363
  self.headers = {
364
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
365
  }
366
 
367
- def forward(self, reference: Dict[str, Any], nebius_api_key: str = None,
368
- deep_verify: bool = False) -> Dict[str, Any]:
369
- """Verificar referencia con Nebius"""
 
370
  result = {
371
- "reference": reference,
 
372
  "verified": False,
373
- "verification_source": "direct",
374
  "download_url": None,
375
- "file_format": None,
376
- "file_size": None,
377
- "quality_score": 0.0,
378
- "notes": [],
379
- "nebius_analysis": None
380
  }
381
 
382
- # Verificación directa primero
383
- direct_result = self._direct_verification(reference)
384
- if direct_result.get("verified"):
385
- result.update(direct_result)
386
- result["quality_score"] = 0.9
387
-
388
- # Verificación con Nebius si está disponible
389
- if nebius_api_key and deep_verify:
390
- nebius_result = self._nebius_verification(reference, nebius_api_key)
391
- result["nebius_analysis"] = nebius_result
392
-
393
- if nebius_result.get("valid", False):
394
- result["verified"] = True
395
- result["verification_source"] = "nebius"
396
- result["quality_score"] = max(
397
- result.get("quality_score", 0),
398
- nebius_result.get("confidence", 0)
399
- )
400
-
401
- # Agregar URLs sugeridas por Nebius
402
- suggested_urls = nebius_result.get("suggested_urls", [])
403
- if suggested_urls and not result.get("download_url"):
404
- result["download_url"] = suggested_urls[0]
405
-
406
- result["notes"].append(
407
- f"Nebius analysis: {nebius_result.get('notes', 'No notes')}"
408
- )
409
-
410
- return result
411
-
412
- def _direct_verification(self, reference: Dict[str, Any]) -> Dict[str, Any]:
413
- """Verificación directa de la referencia"""
414
- import requests
415
-
416
- ref_type = reference.get("type", "")
417
- identifier = reference.get("identifier", "")
418
-
419
- try:
420
- if ref_type == "doi":
421
- return self._verify_doi(identifier)
422
- elif ref_type == "arxiv":
423
- return self._verify_arxiv(identifier)
424
- elif ref_type == "url":
425
- return self._verify_url(identifier)
426
- elif ref_type == "isbn":
427
- return self._verify_isbn(identifier)
428
- except Exception as e:
429
- logger.error(f"Direct verification error: {e}")
430
-
431
- return {"verified": False, "notes": [f"Direct verification failed for {ref_type}"]}
432
-
433
- def _verify_doi(self, doi: str) -> Dict[str, Any]:
434
- """Verificar DOI"""
435
- import requests
436
-
437
  try:
438
- # Crossref
439
  url = f"https://api.crossref.org/works/{doi}"
440
  response = requests.get(url, headers=self.headers, timeout=10)
441
 
@@ -443,976 +260,537 @@ class NebiusVerificationTool(Tool):
443
  data = response.json()
444
  work = data.get('message', {})
445
 
446
- result = {"verified": True, "notes": ["Verified via Crossref"]}
 
 
447
 
448
  # Buscar PDF
449
  links = work.get('link', [])
450
  for link in links:
451
  if link.get('content-type') == 'application/pdf':
452
  result["download_url"] = link.get('URL')
453
- result["file_format"] = "pdf"
454
  break
455
 
456
- return result
 
 
457
  except Exception as e:
458
- logger.error(f"DOI verification error: {e}")
459
 
460
- return {"verified": False}
461
 
462
- def _verify_arxiv(self, arxiv_id: str) -> Dict[str, Any]:
463
- """Verificar arXiv ID"""
464
  import requests
465
 
 
 
466
  try:
467
  # Limpiar ID
468
  if 'arxiv:' in arxiv_id.lower():
469
  arxiv_id = arxiv_id.split(':')[-1].strip()
470
 
471
- # Verificar existencia
472
  api_url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
473
  response = requests.get(api_url, headers=self.headers, timeout=10)
474
 
475
  if response.status_code == 200:
476
- return {
477
- "verified": True,
478
- "download_url": f"https://arxiv.org/pdf/{arxiv_id}.pdf",
479
- "file_format": "pdf",
480
- "notes": ["arXiv paper available"]
481
- }
 
 
 
482
  except Exception as e:
483
- logger.error(f"arXiv verification error: {e}")
484
 
485
- return {"verified": False}
486
 
487
- def _verify_url(self, url: str) -> Dict[str, Any]:
488
- """Verificar URL"""
489
  import requests
 
490
 
491
  try:
492
- response = requests.head(url, headers=self.headers, timeout=10, allow_redirects=True)
493
 
494
  if response.status_code == 200:
495
- result = {"verified": True, "notes": [f"URL accessible: {response.status_code}"]}
 
496
 
497
- # Verificar si es PDF
498
  content_type = response.headers.get('content-type', '')
499
  if 'application/pdf' in content_type:
500
- result["download_url"] = url
501
- result["file_format"] = "pdf"
 
 
502
 
503
- return result
504
- except Exception as e:
505
- logger.error(f"URL verification error: {e}")
506
-
507
- return {"verified": False}
508
-
509
- def _verify_isbn(self, isbn: str) -> Dict[str, Any]:
510
- """Verificar ISBN"""
511
- import requests
512
-
513
- try:
514
- # Open Library
515
- url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&format=json"
516
- response = requests.get(url, headers=self.headers, timeout=10)
517
-
518
- if response.status_code == 200:
519
- data = response.json()
520
- if data:
521
- return {
522
- "verified": True,
523
- "notes": ["ISBN found in Open Library"]
524
- }
525
  except Exception as e:
526
- logger.error(f"ISBN verification error: {e}")
527
 
528
- return {"verified": False}
529
-
530
- def _nebius_verification(self, reference: Dict[str, Any], api_key: str) -> Dict[str, Any]:
531
- """Verificación con Nebius AI"""
532
- try:
533
- nebius = NebiusAPI(api_key)
534
-
535
- # Usar asyncio en contexto síncrono
536
- import nest_asyncio
537
- nest_asyncio.apply()
538
-
539
- loop = asyncio.new_event_loop()
540
- asyncio.set_event_loop(loop)
541
- analysis = loop.run_until_complete(
542
- nebius.verify_reference(reference)
543
- )
544
- loop.close()
545
-
546
- return analysis
547
-
548
- except Exception as e:
549
- logger.error(f"Nebius verification error: {e}")
550
- return {"valid": False, "confidence": 0.0, "notes": f"Error: {str(e)}"}
551
 
552
- # ========== SISTEMA PRINCIPAL CON NEBIUS ==========
553
 
554
- class NebiusBibliographySystem:
555
- """Sistema de procesamiento bibliográfico con Nebius AI"""
556
 
557
- def __init__(self, config: Dict[str, Any]):
558
- self.config = config
559
- self.nebius_api_key = config.get("nebius_api_key")
560
- self.use_nebius = bool(self.nebius_api_key)
561
-
562
- # Inicializar herramientas
563
- self.extraction_tool = NebiusEnhancedExtractionTool()
564
- self.verification_tool = NebiusVerificationTool()
565
-
566
- # Configurar modelo LiteLLM para agentes
567
- self.llm_model = self._configure_llm()
 
 
568
 
569
- # Directorios de salida
570
- self.output_base = "nebius_bibliography"
571
- self.download_dir = os.path.join(self.output_base, "downloads")
572
- self.report_dir = os.path.join(self.output_base, "reports")
573
- self.log_dir = os.path.join(self.output_base, "logs")
 
 
574
 
575
- # Crear directorios
576
- for dir_path in [self.output_base, self.download_dir, self.report_dir, self.log_dir]:
577
- os.makedirs(dir_path, exist_ok=True)
 
 
578
 
579
- # Estadísticas
580
- self.stats = {
581
- "total_processed": 0,
582
- "total_references": 0,
583
- "nebius_calls": 0,
584
- "success_rate": 0.0
 
 
585
  }
586
 
587
- logger.info(f"Nebius system initialized. Nebius AI: {'Enabled' if self.use_nebius else 'Disabled'}")
588
-
589
- def _configure_llm(self):
590
- """Configurar modelo LiteLLM"""
591
- provider = self.config.get("llm_provider", "openai")
592
 
593
- if provider == "nebius" and self.nebius_api_key:
594
- # Configurar Nebius como proveedor personalizado
595
- return LiteLLMModel(
596
- model_id=self.config.get("llm_model", "neural-chat-7b-v3-1"),
597
- api_key=self.nebius_api_key,
598
- api_base=self.config.get("nebius_api_base", "https://api.studio.nebius.com/v1")
599
- )
600
- elif provider == "openai":
601
- return LiteLLMModel(
602
- model_id=self.config.get("llm_model", "gpt-4"),
603
- api_key=self.config.get("openai_api_key")
604
- )
605
- else:
606
- # Default to Nebius if available
607
- if self.nebius_api_key:
608
- return LiteLLMModel(
609
- model_id="neural-chat-7b-v3-1",
610
- api_key=self.nebius_api_key,
611
- api_base="https://api.studio.nebius.com/v1"
612
- )
613
- else:
614
- return LiteLLMModel(model_id="gpt-4")
615
-
616
- async def process_document(self, file_path: str, process_id: str = None) -> Dict[str, Any]:
617
- """Procesar documento completo con Nebius"""
618
- import time
619
- start_time = time.time()
620
-
621
- # Generar ID de proceso
622
- process_id = process_id or self._generate_process_id(file_path)
623
 
624
- logger.info(f"[{process_id}] Processing document: {file_path}")
 
625
 
 
626
  try:
627
- # 1. Leer archivo
628
- file_content = self._read_file(file_path)
629
- if not file_content:
630
- return self._error_result(process_id, "Empty or unreadable file")
631
-
632
- # 2. Extraer referencias
633
- logger.info(f"[{process_id}] Extracting references...")
634
- references = self.extraction_tool.forward(
635
- text=file_content,
636
- nebius_api_key=self.nebius_api_key,
637
- use_ai_enhancement=self.use_nebius
638
- )
639
-
640
- if self.use_nebius:
641
- self.stats["nebius_calls"] += 1
642
-
643
- self.stats["total_references"] += len(references)
644
- logger.info(f"[{process_id}] Found {len(references)} references")
645
-
646
- # 3. Verificar referencias
647
- logger.info(f"[{process_id}] Verifying references...")
648
- verification_results = []
649
- failed_verifications = []
650
-
651
- for i, ref in enumerate(references):
652
- if i % 5 == 0: # Log cada 5 referencias
653
- logger.info(f"[{process_id}] Verified {i}/{len(references)}")
654
-
655
- # Verificar referencia
656
- verification = self.verification_tool.forward(
657
- reference=ref,
658
- nebius_api_key=self.nebius_api_key,
659
- deep_verify=self.use_nebius
660
- )
661
-
662
- if verification.get("verified"):
663
- # Convertir a modelo
664
- citation = CitationModel(
665
- id=ref.get("id"),
666
- raw_text=ref.get("raw_text", ""),
667
- resource_type=ResourceType(ref.get("type", "unknown")),
668
- identifier=ref.get("identifier", ""),
669
- confidence=ref.get("confidence", 0.0),
670
- extracted_from=file_path,
671
- position=ref.get("position", (0, 0)),
672
- nebius_verified=self.use_nebius,
673
- nebius_confidence=verification.get("quality_score", 0.0)
674
- )
675
-
676
- vr = VerificationResult(
677
- citation=citation,
678
- verified=True,
679
- verification_source=verification.get("verification_source", "unknown"),
680
- download_url=verification.get("download_url"),
681
- file_format=verification.get("file_format"),
682
- file_size=verification.get("file_size"),
683
- quality_score=verification.get("quality_score", 0.0),
684
- notes=verification.get("notes", []),
685
- nebius_analysis=verification.get("nebius_analysis")
686
- )
687
- verification_results.append(vr)
688
- else:
689
- # Referencia fallida
690
- citation = CitationModel(
691
- id=ref.get("id"),
692
- raw_text=ref.get("raw_text", ""),
693
- resource_type=ResourceType(ref.get("type", "unknown")),
694
- identifier=ref.get("identifier", ""),
695
- confidence=ref.get("confidence", 0.0),
696
- extracted_from=file_path,
697
- position=ref.get("position", (0, 0)),
698
- nebius_verified=False,
699
- nebius_confidence=0.0
700
- )
701
- failed_verifications.append(citation)
702
-
703
- # 4. Descargar archivos verificados
704
- logger.info(f"[{process_id}] Downloading files...")
705
- downloaded_files = await self._download_files(
706
- verification_results,
707
- process_id
708
- )
709
-
710
- # 5. Generar reporte
711
- processing_time = time.time() - start_time
712
 
713
- report = ProcessingReport(
714
- input_file=file_path,
715
- total_citations=len(references),
716
- verified_resources=verification_results,
717
- downloaded_files=downloaded_files,
718
- failed_verifications=failed_verifications,
719
- processing_time=processing_time,
720
- summary={
721
- "success_rate": len(verification_results) / max(1, len(references)),
722
- "download_rate": len(downloaded_files) / max(1, len(verification_results)),
723
- "avg_quality": sum(vr.quality_score for vr in verification_results) / max(1, len(verification_results))
724
- },
725
- nebius_usage={
726
- "enabled": self.use_nebius,
727
- "calls": self.stats["nebius_calls"],
728
- "enhanced_references": sum(1 for vr in verification_results if vr.nebius_analysis)
729
- }
730
  )
731
 
732
- # 6. Guardar resultados
733
- self._save_results(report, process_id)
734
-
735
- self.stats["total_processed"] += 1
736
- self.stats["success_rate"] = report.summary.get("success_rate", 0.0)
737
-
738
- logger.info(f"[{process_id}] Processing completed in {processing_time:.2f}s")
739
-
740
- return {
741
- "success": True,
742
- "process_id": process_id,
743
- "report": report.dict(),
744
- "zip_path": self._create_zip(report, process_id),
745
- "summary": {
746
- "references_found": len(references),
747
- "verified": len(verification_results),
748
- "downloaded": len(downloaded_files),
749
- "success_rate": f"{report.summary.get('success_rate', 0) * 100:.1f}%",
750
- "processing_time": f"{processing_time:.2f}s"
751
- }
752
- }
753
-
754
- except Exception as e:
755
- logger.error(f"[{process_id}] Processing error: {e}")
756
- return self._error_result(process_id, str(e))
757
-
758
- def _read_file(self, file_path: str) -> str:
759
- """Leer contenido del archivo"""
760
- try:
761
- with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
762
- return f.read()
763
- except Exception as e:
764
- logger.error(f"Error reading file {file_path}: {e}")
765
- return ""
766
-
767
- async def _download_files(self, verification_results: List[VerificationResult],
768
- process_id: str) -> List[str]:
769
- """Descargar archivos de URLs verificadas"""
770
- downloaded_files = []
771
-
772
- for i, vr in enumerate(verification_results):
773
- if vr.download_url:
774
  try:
775
- file_path = await self._download_file(
776
- vr.download_url,
777
- vr.citation.identifier,
778
- process_id,
779
- i
780
- )
781
- if file_path:
782
- downloaded_files.append(file_path)
783
- except Exception as e:
784
- logger.error(f"Download failed for {vr.citation.identifier}: {e}")
785
-
786
- return downloaded_files
787
-
788
- async def _download_file(self, url: str, identifier: str,
789
- process_id: str, index: int) -> Optional[str]:
790
- """Descargar un archivo individual"""
791
- import aiohttp
792
 
793
- try:
794
- # Crear nombre de archivo seguro
795
- safe_name = re.sub(r'[^\w\-\.]', '_', identifier)
796
- if len(safe_name) > 100:
797
- safe_name = safe_name[:100]
798
-
799
- # Determinar extensión
800
- extension = self._get_extension_from_url(url)
801
- if not extension:
802
- extension = ".pdf" # Default
803
-
804
- filename = f"{process_id}_{index:03d}_{safe_name}{extension}"
805
- filepath = os.path.join(self.download_dir, filename)
806
-
807
- # Descargar
808
- timeout = aiohttp.ClientTimeout(total=60)
809
- async with aiohttp.ClientSession(timeout=timeout) as session:
810
- async with session.get(url, headers={'User-Agent': 'Mozilla/5.0'}) as response:
811
- if response.status == 200:
812
- content = await response.read()
813
-
814
- # Verificar que sea un archivo válido
815
- if len(content) > 100: # Archivo no vacío
816
- with open(filepath, 'wb') as f:
817
- f.write(content)
818
-
819
- logger.info(f"Downloaded: {filename} ({len(content)} bytes)")
820
- return filepath
821
-
822
- return None
823
-
824
  except Exception as e:
825
- logger.error(f"Download error for {url}: {e}")
826
- return None
827
-
828
- def _get_extension_from_url(self, url: str) -> str:
829
- """Obtener extensión de archivo desde URL"""
830
- url_lower = url.lower()
831
 
832
- if '.pdf' in url_lower:
833
- return '.pdf'
834
- elif '.docx' in url_lower or '.doc' in url_lower:
835
- return '.docx'
836
- elif '.html' in url_lower or '.htm' in url_lower:
837
- return '.html'
838
- elif '.txt' in url_lower:
839
- return '.txt'
840
- elif '.epub' in url_lower:
841
- return '.epub'
842
-
843
- return ""
844
-
845
- def _generate_process_id(self, file_path: str) -> str:
846
- """Generar ID único de proceso"""
847
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
848
- file_hash = hashlib.md5(file_path.encode()).hexdigest()[:6]
849
- return f"NB_{timestamp}_{file_hash}"
850
-
851
- def _save_results(self, report: ProcessingReport, process_id: str):
852
- """Guardar resultados en disco"""
853
- # Guardar reporte JSON
854
- report_path = os.path.join(self.report_dir, f"{process_id}_report.json")
855
- with open(report_path, 'w', encoding='utf-8') as f:
856
- json.dump(report.dict(), f, indent=2, default=str)
857
-
858
- # Guardar resumen en texto
859
- summary_path = os.path.join(self.report_dir, f"{process_id}_summary.txt")
860
- with open(summary_path, 'w', encoding='utf-8') as f:
861
- f.write(self._generate_text_summary(report))
862
 
863
- def _create_zip(self, report: ProcessingReport, process_id: str) -> str:
864
- """Crear archivo ZIP con resultados"""
865
  import zipfile
 
866
 
867
- zip_path = os.path.join(self.output_base, f"{process_id}_results.zip")
 
868
 
869
- with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
870
- # Agregar reportes
871
- report_files = [
872
- f for f in os.listdir(self.report_dir)
873
- if f.startswith(process_id)
874
- ]
875
-
876
- for file in report_files:
877
- filepath = os.path.join(self.report_dir, file)
878
- zipf.write(filepath, f"reports/{file}")
879
 
880
  # Agregar archivos descargados
881
- for file_path in report.downloaded_files:
882
  if os.path.exists(file_path):
883
- filename = os.path.basename(file_path)
884
- zipf.write(file_path, f"downloads/{filename}")
885
 
886
- # Agregar log
887
- log_path = os.path.join(self.log_dir, f"{process_id}_log.txt")
888
- with open(log_path, 'w') as f:
889
- f.write(f"Process ID: {process_id}\n")
890
- f.write(f"Time: {datetime.now().isoformat()}\n")
891
- f.write(f"Success rate: {report.summary.get('success_rate', 0) * 100:.1f}%\n")
892
-
893
- zipf.write(log_path, "process_log.txt")
894
 
895
- return zip_path
896
 
897
- def _generate_text_summary(self, report: ProcessingReport) -> str:
898
- """Generar resumen en texto"""
899
- summary = f"""
900
- NEBIUS BIBLIOGRAPHY PROCESSING REPORT
901
- =====================================
902
-
903
- Process ID: Generated automatically
904
- Input File: {report.input_file}
905
- Processing Time: {report.processing_time:.2f} seconds
906
- Timestamp: {report.timestamp}
907
-
908
- SUMMARY STATISTICS
909
- ------------------
910
- Total References Found: {report.total_citations}
911
- Successfully Verified: {len(report.verified_resources)}
912
- Files Downloaded: {len(report.downloaded_files)}
913
- Verification Success Rate: {report.summary.get('success_rate', 0) * 100:.1f}%
914
- Average Quality Score: {report.summary.get('avg_quality', 0):.2f}
915
 
916
- NEBIUS AI USAGE
917
- ---------------
918
- Enabled: {report.nebius_usage.get('enabled', False)}
919
- API Calls: {report.nebius_usage.get('calls', 0)}
920
- Enhanced References: {report.nebius_usage.get('enhanced_references', 0)}
921
 
922
- VERIFIED RESOURCES (Top 10)
923
- ---------------------------
 
 
924
  """
925
-
926
- for i, vr in enumerate(report.verified_resources[:10], 1):
927
- summary += f"\n{i}. {vr.citation.identifier}"
928
- summary += f"\n Type: {vr.citation.resource_type.value}"
929
- summary += f"\n Source: {vr.verification_source}"
930
- summary += f"\n Quality: {vr.quality_score:.2f}"
931
- summary += f"\n Nebius Enhanced: {vr.citation.nebius_verified}"
932
- if vr.download_url:
933
- summary += f"\n Downloaded: Yes"
934
- summary += "\n"
935
-
936
- if report.failed_verifications:
937
- summary += f"\nFAILED VERIFICATIONS ({len(report.failed_verifications)})\n"
938
- summary += "-" * 40 + "\n"
939
- for citation in report.failed_verifications[:5]:
940
- summary += f"- {citation.identifier} ({citation.resource_type.value})\n"
941
-
942
- summary += f"\nFILES DOWNLOADED\n"
943
- summary += "-" * 40 + "\n"
944
- for file_path in report.downloaded_files:
945
- if os.path.exists(file_path):
946
- file_size = os.path.getsize(file_path)
947
- summary += f"- {os.path.basename(file_path)} ({file_size} bytes)\n"
948
-
949
- return summary
950
-
951
- def _error_result(self, process_id: str, error: str) -> Dict[str, Any]:
952
- """Generar resultado de error"""
953
- return {
954
- "success": False,
955
- "process_id": process_id,
956
- "error": error,
957
- "timestamp": datetime.now().isoformat()
958
- }
959
-
960
- def get_stats(self) -> Dict[str, Any]:
961
- """Obtener estadísticas del sistema"""
962
- return {
963
- "total_processed": self.stats["total_processed"],
964
- "total_references": self.stats["total_references"],
965
- "nebius_calls": self.stats["nebius_calls"],
966
- "success_rate": self.stats["success_rate"],
967
- "output_directory": self.output_base
968
- }
969
 
970
- # ========== INTERFAZ GRADIO MEJORADA ==========
971
 
972
- def create_nebius_interface():
973
- """Crear interfaz Gradio con soporte para Nebius"""
974
 
975
- system = None
976
- current_process = None
977
 
978
- def initialize_system(provider, model, nebius_key, nebius_base, openai_key):
979
- """Inicializar sistema con configuración"""
980
- nonlocal system
981
-
982
- config = {
983
- "llm_provider": provider,
984
- "llm_model": model,
985
- "nebius_api_key": nebius_key,
986
- "nebius_api_base": nebius_base or "https://api.studio.nebius.com/v1",
987
- "openai_api_key": openai_key,
988
- "use_nebius": bool(nebius_key)
989
- }
990
 
991
  try:
992
- system = NebiusBibliographySystem(config)
993
- return "✅ Sistema inicializado con Nebius AI" if nebius_key else "✅ Sistema inicializado (sin Nebius)"
994
- except Exception as e:
995
- return f"❌ Error: {str(e)}"
996
-
997
- async def process_document(file_obj, use_nebius, progress=gr.Progress()):
998
- """Procesar documento"""
999
- nonlocal system, current_process
1000
-
1001
- if not system:
1002
- return None, "❌ Sistema no inicializado", "", "", ""
1003
-
1004
- try:
1005
- progress(0, desc="Preparando archivo...")
1006
-
1007
- # Guardar archivo temporalmente
1008
- import tempfile
1009
- import shutil
1010
-
1011
- temp_dir = tempfile.mkdtemp()
1012
- file_path = os.path.join(temp_dir, file_obj.name)
1013
- shutil.copy(file_obj.name, file_path)
1014
-
1015
- progress(0.1, desc="Procesando con Nebius..." if use_nebius else "Procesando...")
1016
-
1017
- # Procesar documento
1018
- result = await system.process_document(file_path)
1019
-
1020
- if not result.get("success"):
1021
- # Limpiar temporal
1022
- shutil.rmtree(temp_dir, ignore_errors=True)
1023
- return None, f"❌ Error: {result.get('error')}", "", "", ""
1024
-
1025
- current_process = result.get("process_id")
1026
- summary = result.get("summary", {})
1027
-
1028
- progress(0.9, desc="Generando reportes...")
1029
-
1030
- # Generar visualizaciones
1031
- report_data = result.get("report", {})
1032
-
1033
- # HTML output
1034
- html_output = self._generate_html_report(report_data)
1035
-
1036
- # Text output
1037
- text_output = self._generate_text_report(report_data)
1038
-
1039
- # JSON output
1040
- json_output = json.dumps(report_data, indent=2, default=str)
1041
-
1042
- # Statistics
1043
- stats_output = self._generate_stats_display(summary)
1044
-
1045
- progress(1.0, desc="Completado!")
1046
-
1047
- # Limpiar temporal
1048
- shutil.rmtree(temp_dir, ignore_errors=True)
1049
-
1050
- return (
1051
- result.get("zip_path"),
1052
- f"✅ Proceso {current_process} completado",
1053
- html_output,
1054
- text_output,
1055
- json_output,
1056
- stats_output
1057
  )
1058
 
1059
- except Exception as e:
1060
- logger.error(f"Processing error: {e}")
1061
- return None, f"❌ Error: {str(e)}", "", "", "", ""
1062
-
1063
- def _generate_html_report(self, report_data: Dict) -> str:
1064
- """Generar reporte HTML"""
1065
- verified = len(report_data.get("verified_resources", []))
1066
- total = report_data.get("total_citations", 0)
1067
- success_rate = (verified / max(1, total)) * 100
1068
-
1069
- nebius_usage = report_data.get("nebius_usage", {})
1070
-
1071
- html = f"""
1072
- <div style="font-family: Arial, sans-serif; padding: 20px;">
1073
- <h2 style="color: #2c3e50;">📊 Reporte de Procesamiento Nebius</h2>
1074
-
1075
- <div style="background: #ecf0f1; padding: 15px; border-radius: 10px; margin: 15px 0;">
1076
- <h3 style="color: #34495e;">📈 Resumen General</h3>
1077
- <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 10px;">
1078
- <div style="background: white; padding: 10px; border-radius: 5px;">
1079
- <strong>Referencias Encontradas</strong><br>
1080
- <span style="font-size: 24px; color: #3498db;">{total}</span>
1081
- </div>
1082
- <div style="background: white; padding: 10px; border-radius: 5px;">
1083
- <strong>Verificadas</strong><br>
1084
- <span style="font-size: 24px; color: #2ecc71;">{verified}</span>
1085
- </div>
1086
- <div style="background: white; padding: 10px; border-radius: 5px;">
1087
- <strong>Tasa de Éxito</strong><br>
1088
- <span style="font-size: 24px; color: #9b59b6;">{success_rate:.1f}%</span>
1089
- </div>
1090
- <div style="background: white; padding: 10px; border-radius: 5px;">
1091
- <strong>Tiempo</strong><br>
1092
- <span style="font-size: 24px; color: #e74c3c;">{report_data.get('processing_time', 0):.1f}s</span>
1093
  </div>
1094
  </div>
1095
- </div>
1096
-
1097
- <div style="background: #d5f4e6; padding: 15px; border-radius: 10px; margin: 15px 0;">
1098
- <h3 style="color: #27ae60;">🤖 Nebius AI</h3>
1099
- <p><strong>Estado:</strong> {'✅ Activado' if nebius_usage.get('enabled') else '❌ Desactivado'}</p>
1100
- <p><strong>Llamadas API:</strong> {nebius_usage.get('calls', 0)}</p>
1101
- <p><strong>Referencias Mejoradas:</strong> {nebius_usage.get('enhanced_references', 0)}</p>
1102
- </div>
1103
-
1104
- <div style="background: #e8f4fc; padding: 15px; border-radius: 10px; margin: 15px 0;">
1105
- <h3 style="color: #2980b9;">📥 Descargas</h3>
1106
- <p><strong>Archivos Descargados:</strong> {len(report_data.get('downloaded_files', []))}</p>
1107
- <ul>
1108
- """
1109
-
1110
- for file in report_data.get("downloaded_files", [])[:5]:
1111
- filename = os.path.basename(file)
1112
- html += f'<li>{filename}</li>'
1113
-
1114
- html += """
1115
- </ul>
1116
- </div>
1117
 
1118
- <div style="background: #fdebd0; padding: 15px; border-radius: 10px; margin: 15px 0;">
1119
- <h3 style="color: #d35400;">⚠️ Referencias No Verificadas</h3>
1120
- <p><strong>Total:</strong> {failed}</p>
1121
- """.format(failed=len(report_data.get("failed_verifications", [])))
1122
-
1123
- html += """
1124
- </div>
1125
- </div>
1126
- """
1127
-
1128
- return html
1129
-
1130
- def _generate_text_report(self, report_data: Dict) -> str:
1131
- """Generar reporte en texto"""
1132
- verified = len(report_data.get("verified_resources", []))
1133
- total = report_data.get("total_citations", 0)
1134
-
1135
- text = f"""
1136
- REPORTE DE PROCESAMIENTO
1137
- ========================
1138
-
1139
- Archivo: {report_data.get('input_file', 'Desconocido')}
1140
- Fecha: {report_data.get('timestamp', '')}
1141
-
1142
- ESTADÍSTICAS:
1143
- -------------
1144
- • Referencias encontradas: {total}
1145
- • Referencias verificadas: {verified}
1146
- • Archivos descargados: {len(report_data.get('downloaded_files', []))}
1147
- • Tiempo de procesamiento: {report_data.get('processing_time', 0):.2f}s
1148
- • Tasa de éxito: {(verified/max(1, total))*100:.1f}%
1149
-
1150
- NEBIUS AI:
1151
- ----------
1152
- • Estado: {'Activado' if report_data.get('nebius_usage', {}).get('enabled') else 'Desactivado'}
1153
- • Llamadas API: {report_data.get('nebius_usage', {}).get('calls', 0)}
1154
- • Referencias mejoradas: {report_data.get('nebius_usage', {}).get('enhanced_references', 0)}
1155
-
1156
- Para más detalles, consulte el archivo ZIP con el reporte completo.
1157
- """
1158
-
1159
- return text
1160
-
1161
- def _generate_stats_display(self, summary: Dict) -> str:
1162
- """Generar display de estadísticas"""
1163
- return f"""
1164
- ⚡ PROCESO COMPLETADO ⚡
1165
-
1166
- 📊 Estadísticas Rápidas:
1167
- • Referencias: {summary.get('references_found', 0)}
1168
- • Verificadas: {summary.get('verified', 0)}
1169
- • Descargadas: {summary.get('downloaded', 0)}
1170
- • Tasa de éxito: {summary.get('success_rate', '0%')}
1171
- • Tiempo: {summary.get('processing_time', '0s')}
1172
- """
1173
-
1174
- def get_system_stats():
1175
- """Obtener estadísticas del sistema"""
1176
- nonlocal system
1177
-
1178
- if not system:
1179
- return "❌ Sistema no inicializado"
1180
-
1181
- stats = system.get_stats()
1182
-
1183
- return f"""
1184
- 📈 Estadísticas del Sistema Nebius:
1185
-
1186
- • Documentos procesados: {stats.get('total_processed', 0)}
1187
- • Referencias totales: {stats.get('total_references', 0)}
1188
- • Llamadas Nebius API: {stats.get('nebius_calls', 0)}
1189
- • Tasa de éxito promedio: {stats.get('success_rate', 0) * 100:.1f}%
1190
- • Directorio de salida: {stats.get('output_directory', 'N/A')}
1191
- """
1192
 
1193
- # Crear interfaz
1194
- with gr.Blocks(title="Nebius Bibliography System", theme=gr.themes.Soft()) as interface:
1195
- gr.Markdown("# 📚 Sistema de Recopilación Bibliográfica con Nebius AI")
1196
- gr.Markdown("Procesa documentos académicos usando Nebius AI para extracción y verificación inteligente")
1197
 
1198
  with gr.Row():
1199
  with gr.Column(scale=1):
1200
- gr.Markdown("### ⚙️ Configuración Nebius AI")
1201
 
1202
- provider = gr.Dropdown(
1203
- choices=["nebius", "openai"],
1204
- label="Proveedor de IA Principal",
1205
- value="nebius",
1206
- info="Selecciona Nebius para usar la API de Nebius AI"
1207
  )
1208
 
1209
- model = gr.Textbox(
1210
- label="Modelo",
1211
- value="neural-chat-7b-v3-1",
1212
- placeholder="Modelo de Nebius (ej: neural-chat-7b-v3-1)"
1213
  )
1214
 
1215
- nebius_key = gr.Textbox(
1216
- label="Nebius API Key",
1217
  type="password",
1218
- placeholder="Ingresa tu API Key de Nebius"
1219
  )
1220
 
1221
- nebius_base = gr.Textbox(
1222
- label="Nebius API Base (opcional)",
1223
- value="https://api.studio.nebius.com/v1",
1224
- placeholder="URL base de la API de Nebius"
1225
  )
1226
 
1227
- openai_key = gr.Textbox(
1228
- label="OpenAI API Key (respaldo)",
1229
- type="password",
1230
- placeholder="Opcional: Key de OpenAI como respaldo"
1231
- )
1232
-
1233
- init_btn = gr.Button("🚀 Inicializar Sistema Nebius", variant="primary")
1234
- init_status = gr.Markdown("")
1235
-
1236
- gr.Markdown("---")
1237
- stats_btn = gr.Button("📊 Estadísticas del Sistema")
1238
- system_stats = gr.Markdown("")
1239
 
1240
  with gr.Column(scale=2):
1241
- gr.Markdown("### 📄 Procesar Documento")
1242
-
1243
- file_input = gr.File(
1244
- label="Sube tu documento",
1245
- file_types=[".txt", ".pdf", ".docx", ".html", ".md"]
1246
- )
1247
 
1248
- use_nebius = gr.Checkbox(
1249
- label="Usar Nebius AI para mejora de precisión",
1250
- value=True
 
 
1251
  )
1252
 
1253
- process_btn = gr.Button("🔍 Procesar con Nebius AI", variant="primary")
1254
 
1255
  gr.Markdown("### 📦 Resultados")
1256
 
1257
- result_file = gr.File(label="Descargar Paquete Completo (ZIP)")
1258
- result_status = gr.Markdown("")
1259
- stats_display = gr.Markdown("")
1260
 
1261
  with gr.Tabs():
1262
  with gr.TabItem("📋 Vista HTML"):
1263
- html_output = gr.HTML(label="Reporte Interactivo")
1264
 
1265
- with gr.TabItem("📝 Texto Plano"):
1266
  text_output = gr.Textbox(
1267
  label="Resumen",
1268
- lines=15,
1269
- max_lines=30
1270
  )
1271
 
1272
- with gr.TabItem("🔧 JSON Completo"):
1273
  json_output = gr.Code(
1274
- label="Datos Completos",
1275
  language="json",
1276
- lines=20
1277
  )
1278
 
1279
  # Conectar eventos
1280
- init_btn.click(
1281
- initialize_system,
1282
- inputs=[provider, model, nebius_key, nebius_base, openai_key],
1283
- outputs=init_status
1284
- )
1285
-
1286
  process_btn.click(
1287
- process_document,
1288
- inputs=[file_input, use_nebius],
1289
- outputs=[result_file, result_status, html_output, text_output, json_output, stats_display]
1290
  )
1291
 
1292
- stats_btn.click(
1293
- get_system_stats,
1294
- outputs=system_stats
 
 
1295
  )
1296
-
1297
- # Información
1298
- gr.Markdown("""
1299
- ### 📌 Características Nebius AI
1300
-
1301
- **🔍 Extracción Inteligente:**
1302
- - Identificación contextual de referencias
1303
- - Corrección automática de identificadores
1304
- - Clasificación por tipo de recurso
1305
-
1306
- **✅ Verificación Avanzada:**
1307
- - Análisis de accesibilidad
1308
- - Detección de acceso abierto
1309
- - Sugerencias de fuentes alternativas
1310
-
1311
- **📊 Reportes Mejorados:**
1312
- - Métricas de confianza Nebius
1313
- - Análisis de calidad por referencia
1314
- - Estadísticas de uso de IA
1315
-
1316
- ### ⚠️ Notas Importantes
1317
-
1318
- 1. La API de Nebius requiere una key válida
1319
- 2. Los archivos grandes pueden consumir más tokens
1320
- 3. Se recomienda usar Nebius para máxima precisión
1321
- 4. Mantén tu API key segura y no la compartas
1322
-
1323
- ### 🔗 Recursos
1324
-
1325
- • [Documentación Nebius AI](https://docs.nebius.com)
1326
- • [Obtener API Key](https://studio.nebius.com)
1327
- • [Soporte Técnico](https://support.nebius.com)
1328
- """)
1329
 
1330
  return interface
1331
 
1332
  # ========== EJECUCIÓN PRINCIPAL ==========
1333
 
1334
- async def main():
1335
  """Función principal"""
1336
- import argparse
1337
-
1338
- parser = argparse.ArgumentParser(description="Sistema Nebius de Recopilación Bibliográfica")
1339
- parser.add_argument("--mode", choices=["gui", "cli"], default="gui",
1340
- help="Modo de ejecución")
1341
- parser.add_argument("--file", type=str, help="Archivo a procesar (modo CLI)")
1342
- parser.add_argument("--nebius-key", help="API Key de Nebius")
1343
- parser.add_argument("--model", default="neural-chat-7b-v3-1", help="Modelo Nebius")
1344
- parser.add_argument("--api-base", default="https://api.studio.nebius.com/v1",
1345
- help="URL base de Nebius API")
1346
-
1347
- args = parser.parse_args()
1348
-
1349
- if args.mode == "gui":
1350
- # Ejecutar interfaz Gradio
1351
- interface = create_nebius_interface()
1352
- interface.launch(
1353
- server_name="0.0.0.0",
1354
- server_port=7860,
1355
- share=True,
1356
- debug=True
1357
- )
1358
-
1359
- elif args.mode == "cli":
1360
- # Modo línea de comandos
1361
- if not args.file:
1362
- print("❌ Error: Debes especificar un archivo con --file")
1363
- return
1364
-
1365
- if not os.path.exists(args.file):
1366
- print(f"❌ Error: Archivo no encontrado: {args.file}")
1367
- return
1368
-
1369
- if not args.nebius_key:
1370
- print("⚠️ Advertencia: No se proporcionó API Key de Nebius")
1371
- use_nebius = False
1372
- nebius_key = None
1373
- else:
1374
- use_nebius = True
1375
- nebius_key = args.nebius_key
1376
-
1377
- # Configurar sistema
1378
- config = {
1379
- "llm_provider": "nebius" if use_nebius else "openai",
1380
- "llm_model": args.model,
1381
- "nebius_api_key": nebius_key,
1382
- "nebius_api_base": args.api_base,
1383
- "use_nebius": use_nebius
1384
- }
1385
-
1386
- system = NebiusBibliographySystem(config)
1387
-
1388
- print(f"🔍 Procesando archivo: {args.file}")
1389
- print(f"🤖 Nebius AI: {'Activado' if use_nebius else 'Desactivado'}")
1390
- print("⏳ Procesando...")
1391
-
1392
- result = await system.process_document(args.file)
1393
-
1394
- if result.get("success"):
1395
- print(f"✅ Procesamiento completado!")
1396
- print(f"📊 ID del proceso: {result.get('process_id')}")
1397
-
1398
- summary = result.get("summary", {})
1399
- print(f"""
1400
- 📈 Resultados:
1401
- - Referencias encontradas: {summary.get('references_found', 0)}
1402
- - Referencias verificadas: {summary.get('verified', 0)}
1403
- - Archivos descargados: {summary.get('downloaded', 0)}
1404
- - Tasa de éxito: {summary.get('success_rate', '0%')}
1405
- - Tiempo de procesamiento: {summary.get('processing_time', '0s')}
1406
-
1407
- 📦 Paquete de resultados: {result.get('zip_path')}
1408
-
1409
- 📊 Estadísticas Nebius:
1410
- - Llamadas API: {result.get('report', {}).get('nebius_usage', {}).get('calls', 0)}
1411
- - Referencias mejoradas: {result.get('report', {}).get('nebius_usage', {}).get('enhanced_references', 0)}
1412
- """)
1413
- else:
1414
- print(f"❌ Error: {result.get('error')}")
1415
 
1416
  if __name__ == "__main__":
1417
- import asyncio
1418
- asyncio.run(main())
 
4
  import logging
5
  import zipfile
6
  import asyncio
7
+ from typing import Dict, List, Optional, Any
 
 
 
8
  from datetime import datetime
9
  import gradio as gr
10
  from enum import Enum
11
  import hashlib
 
12
  import aiohttp
13
 
 
 
 
 
 
14
  # Configuración de logging
15
  logging.basicConfig(
16
  level=logging.INFO,
17
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 
 
 
 
18
  )
19
  logger = logging.getLogger(__name__)
20
 
21
+ # ========== CONFIGURACIÓN DE APIs ==========
22
 
23
+ class APIProvider:
24
+ """Gestor de diferentes APIs de IA"""
25
 
26
+ def __init__(self):
27
+ self.available_apis = {
28
+ "nebius": {
29
+ "name": "Nebius AI",
30
+ "base_url": "https://api.nebius.ai/v1",
31
+ "models": ["neural-chat-7b-v3-1", "llama-2-70b-chat", "mistral-7b-instruct"],
32
+ "headers": {"Content-Type": "application/json"}
33
+ },
34
+ "moonshot": {
35
+ "name": "Moonshot AI",
36
+ "base_url": "https://api.moonshot.cn/v1",
37
+ "models": ["moonshot-v1-8k", "moonshot-v1-32k", "moonshot-v1-128k"],
38
+ "headers": {"Content-Type": "application/json"}
39
+ },
40
+ "openai": {
41
+ "name": "OpenAI",
42
+ "base_url": "https://api.openai.com/v1",
43
+ "models": ["gpt-4", "gpt-3.5-turbo", "gpt-4-turbo"],
44
+ "headers": {"Content-Type": "application/json"}
45
+ },
46
+ "anthropic": {
47
+ "name": "Anthropic",
48
+ "base_url": "https://api.anthropic.com/v1",
49
+ "models": ["claude-3-opus-20240229", "claude-3-sonnet-20240229", "claude-3-haiku-20240307"],
50
+ "headers": {"Content-Type": "application/json", "anthropic-version": "2023-06-01"}
51
+ },
52
+ "deepseek": {
53
+ "name": "DeepSeek",
54
+ "base_url": "https://api.deepseek.com/v1",
55
+ "models": ["deepseek-chat", "deepseek-coder"],
56
+ "headers": {"Content-Type": "application/json"}
57
+ }
58
  }
 
 
 
 
 
59
 
60
+ # Para Kimi, necesitamos configurar un endpoint específico
61
+ self.custom_models = {
62
+ "moonshotai/Kimi-K2-Instruct": {
63
+ "provider": "moonshot",
64
+ "model_id": "moonshot-v1-8k", # Asumiendo que es compatible
65
+ "requires_special_handling": True
66
+ }
 
67
  }
68
+
69
+ async def call_api(self, provider: str, api_key: str, model: str,
70
+ messages: List[Dict], max_tokens: int = 1000) -> Optional[str]:
71
+ """Llamar a la API del proveedor seleccionado"""
72
+ if provider not in self.available_apis and provider not in ["custom", "moonshot"]:
73
+ logger.error(f"Proveedor no soportado: {provider}")
74
+ return None
75
 
76
  try:
77
+ # Manejo especial para Kimi
78
+ if model == "moonshotai/Kimi-K2-Instruct":
79
+ return await self._call_moonshot_kimi(api_key, messages, max_tokens)
80
+
81
+ # Configuración según el proveedor
82
+ if provider in ["moonshot", "custom"]:
83
+ base_url = self.available_apis["moonshot"]["base_url"]
84
+ headers = {
85
+ "Authorization": f"Bearer {api_key}",
86
+ "Content-Type": "application/json"
87
+ }
88
+ else:
89
+ api_config = self.available_apis[provider]
90
+ base_url = api_config["base_url"]
91
+ headers = {**api_config["headers"], "Authorization": f"Bearer {api_key}"}
92
+
93
+ # Preparar payload
94
+ payload = {
95
+ "model": model,
96
+ "messages": messages,
97
+ "max_tokens": max_tokens,
98
+ "temperature": 0.7,
99
+ "top_p": 0.95
100
+ }
101
+
102
+ # Realizar la llamada
103
+ url = f"{base_url}/chat/completions"
104
+
105
  async with aiohttp.ClientSession() as session:
106
  async with session.post(
107
+ url,
108
+ headers=headers,
109
  json=payload,
110
  timeout=30
111
  ) as response:
 
114
  return data.get("choices", [{}])[0].get("message", {}).get("content", "")
115
  else:
116
  error_text = await response.text()
117
+ logger.error(f"API Error {response.status}: {error_text}")
118
+ return None
119
+
 
 
120
  except Exception as e:
121
+ logger.error(f"Error calling API {provider}: {e}")
122
+ return None
 
123
 
124
+ async def _call_moonshot_kimi(self, api_key: str, messages: List[Dict], max_tokens: int) -> Optional[str]:
125
+ """Llamada específica para Kimi de Moonshot"""
 
 
126
  try:
127
+ url = "https://api.moonshot.cn/v1/chat/completions"
128
+ headers = {
129
+ "Authorization": f"Bearer {api_key}",
130
+ "Content-Type": "application/json"
131
+ }
132
+
133
+ payload = {
134
+ "model": "moonshot-v1-8k", # Modelo base para Kimi
135
+ "messages": messages,
136
+ "max_tokens": max_tokens,
137
+ "temperature": 0.7,
138
+ "top_p": 0.95
139
+ }
140
+
141
+ async with aiohttp.ClientSession() as session:
142
+ async with session.post(
143
+ url,
144
+ headers=headers,
145
+ json=payload,
146
+ timeout=30
147
+ ) as response:
148
+ if response.status == 200:
149
+ data = await response.json()
150
+ return data.get("choices", [{}])[0].get("message", {}).get("content", "")
151
+ else:
152
+ error_text = await response.text()
153
+ logger.error(f"Kimi API Error {response.status}: {error_text}")
154
+ return None
155
+
156
  except Exception as e:
157
+ logger.error(f"Error calling Kimi API: {e}")
158
+ return None
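For reference, a minimal usage sketch of the new APIProvider (not part of the commit; the key is a placeholder, and it relies on the OpenAI-style chat/completions response shape that call_api already parses):

import asyncio

async def demo():
    api = APIProvider()
    reply = await api.call_api(
        provider="nebius",
        api_key="YOUR_API_KEY",  # placeholder, not a real key
        model="neural-chat-7b-v3-1",
        messages=[{"role": "user", "content": "Extract the DOI from: doi:10.1000/xyz123"}],
        max_tokens=200,
    )
    print(reply)  # call_api returns None on HTTP or network errors

asyncio.run(demo())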
 
 
159
 
160
+ # ========== EXTRACTOR DE REFERENCIAS ==========
161
 
162
+ class ReferenceExtractor:
163
+ """Extrae referencias bibliográficas de texto"""
 
 
164
 
165
  def __init__(self):
 
 
166
  self.patterns = {
167
+ "doi": [
168
  r'\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b',
169
  r'doi:\s*(10\.\d{4,9}/[-._;()/:A-Z0-9]+)',
170
+ r'DOI:\s*(10\.\d{4,9}/[-._;()/:A-Z0-9]+)'
171
  ],
172
+ "arxiv": [
 
 
 
173
  r'arXiv:\s*(\d{4}\.\d{4,5}(v\d+)?)',
174
+ r'arxiv:\s*([a-z\-]+/\d{7})',
175
+ r'\b\d{4}\.\d{4,5}(v\d+)?\b'
176
+ ],
177
+ "isbn": [
178
+ r'ISBN(?:-1[03])?:?\s*(97[89][- ]?)?[0-9]{1,5}[- ]?[0-9]+[- ]?[0-9]+[- ]?[0-9X]',
179
+ r'\b(?:97[89][- ]?)?[0-9]{1,5}[- ]?[0-9]+[- ]?[0-9]+[- ]?[0-9X]\b'
180
  ],
181
+ "url": [
182
+ r'https?://[^\s<>"]+|www\.[^\s<>"]+'
183
+ ],
184
+ "pmid": [
185
+ r'PMID:\s*(\d+)',
186
+ r'PubMed ID:\s*(\d+)'
187
+ ]
188
  }
189
 
190
+ def extract_from_text(self, text: str) -> Dict[str, List[str]]:
191
+ """Extrae todos los identificadores del texto"""
192
+ results = {}
 
 
193
 
194
+ for ref_type, patterns in self.patterns.items():
195
+ matches = []
196
  for pattern in patterns:
197
+ found = re.findall(pattern, text, re.IGNORECASE)
198
+ # Limpiar los resultados
199
+ for match in found:
200
+ if isinstance(match, tuple):
201
+ match = match[0]
202
+ if match:
203
+ match = self._clean_identifier(match, ref_type)
204
+ if match and match not in matches:
205
+ matches.append(match)
206
+
207
+ if matches:
208
+ results[ref_type] = matches
209
+
210
+ return results
211
+
212
+ def _clean_identifier(self, identifier: str, ref_type: str) -> str:
213
+ """Limpia el identificador"""
 
 
214
  identifier = identifier.strip()
215
 
216
+ # Eliminar prefijos
217
  prefixes = ['doi:', 'DOI:', 'arxiv:', 'arXiv:', 'isbn:', 'ISBN:', 'pmid:', 'PMID:']
218
  for prefix in prefixes:
219
  if identifier.startswith(prefix):
220
  identifier = identifier[len(prefix):].strip()
221
 
222
+ # Limpiar caracteres
223
  identifier = identifier.strip('"\'<>()[]{}')
224
 
225
+ # Para URLs, asegurar protocolo
226
+ if ref_type == "url" and not identifier.startswith(('http://', 'https://')):
227
+ identifier = f"https://{identifier}"
228
 
229
  return identifier
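A quick sketch of how the extractor is meant to be driven (illustrative only; the sample text is made up):

extractor = ReferenceExtractor()
text = "See doi:10.1038/nphys1170 and arXiv:1706.03762, or PMID: 12345678."
refs = extractor.extract_from_text(text)
# refs is a dict keyed by reference type, e.g. refs.get("doi") -> ["10.1038/nphys1170"];
# note the broad ISBN/arXiv fallback patterns can also pick up stray numeric strings.
print(refs)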
 
 
230
 
231
+ # ========== VERIFICADOR DE REFERENCIAS ==========
232
+
233
+ class ReferenceVerifier:
234
+ """Verifica y descarga referencias"""
 
 
235
 
236
  def __init__(self):
 
237
  self.headers = {
238
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
239
  }
240
 
241
+ async def verify_doi(self, doi: str) -> Dict[str, Any]:
242
+ """Verifica un DOI y obtiene metadatos"""
243
+ import requests
244
+
245
  result = {
246
+ "identifier": doi,
247
+ "type": "doi",
248
  "verified": False,
249
+ "metadata": {},
250
  "download_url": None,
251
+ "error": None
 
 
 
 
252
  }
253
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  try:
255
+ # Intentar con Crossref
256
  url = f"https://api.crossref.org/works/{doi}"
257
  response = requests.get(url, headers=self.headers, timeout=10)
258
 
 
260
  data = response.json()
261
  work = data.get('message', {})
262
 
263
+ result["verified"] = True
264
+ result["metadata"] = {
265
+ "title": work.get('title', [''])[0],
266
+ "authors": work.get('author', []),
267
+ "journal": work.get('container-title', [''])[0],
268
+ "year": work.get('published', {}).get('date-parts', [[None]])[0][0],
269
+ "url": work.get('URL')
270
+ }
271
 
272
  # Buscar PDF
273
  links = work.get('link', [])
274
  for link in links:
275
  if link.get('content-type') == 'application/pdf':
276
  result["download_url"] = link.get('URL')
 
277
  break
278
 
279
+ # Si no hay PDF en Crossref, probar Unpaywall
280
+ if not result["download_url"]:
281
+ unpaywall_url = f"https://api.unpaywall.org/v2/{doi}[email protected]"
282
+ unpaywall_response = requests.get(unpaywall_url, timeout=10)
283
+ if unpaywall_response.status_code == 200:
284
+ unpaywall_data = unpaywall_response.json()
285
+ if unpaywall_data.get('is_oa'):
286
+ result["download_url"] = unpaywall_data.get('best_oa_location', {}).get('url')
287
+
288
+ else:
289
+ result["error"] = f"Crossref API returned {response.status_code}"
290
+
291
  except Exception as e:
292
+ result["error"] = str(e)
293
 
294
+ return result
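
    # Hedged sketch of the result shape (comments only). For a DOI that Crossref resolves,
    # verify_doi returns roughly:
    #   {"identifier": "10.1145/3065386", "type": "doi", "verified": True,
    #    "metadata": {"title": "...", "authors": [...], "journal": "...", "year": ..., "url": "..."},
    #    "download_url": None or "https://...", "error": None}
    # The Unpaywall fallback only fires when Crossref lists no PDF link, and Unpaywall expects
    # a real contact address in its email query parameter; the bracketed address above is a
    # redacted placeholder, not a working value.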

    async def verify_arxiv(self, arxiv_id: str) -> Dict[str, Any]:
        """Verifies an arXiv ID"""
        import requests

        result = {
            "identifier": arxiv_id,
            "type": "arxiv",
            "verified": False,
            "metadata": {},
            "download_url": None,
            "error": None
        }

        try:
            # Clean the ID
            if 'arxiv:' in arxiv_id.lower():
                arxiv_id = arxiv_id.split(':')[-1].strip()

            # Fetch metadata
            api_url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
            response = requests.get(api_url, headers=self.headers, timeout=10)

            if response.status_code == 200:
                result["verified"] = True
                result["download_url"] = f"https://arxiv.org/pdf/{arxiv_id}.pdf"

                # Parse basic metadata from the Atom XML
                import xml.etree.ElementTree as ET
                root = ET.fromstring(response.text)
                ns = {'atom': 'http://www.w3.org/2005/Atom'}

                entry = root.find('.//atom:entry', ns)
                if entry is not None:
                    title = entry.find('atom:title', ns)
                    if title is not None:
                        result["metadata"]["title"] = title.text

                    summary = entry.find('atom:summary', ns)
                    if summary is not None:
                        result["metadata"]["abstract"] = summary.text

            else:
                result["error"] = f"arXiv API returned {response.status_code}"

        except Exception as e:
            result["error"] = str(e)

        return result
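
    # Hedged note (comments only): the arXiv export API answers with an Atom feed, so the
    # metadata above comes from the first <entry>. For id_list=1706.03762 the result is roughly:
    #   {"identifier": "1706.03762", "type": "arxiv", "verified": True,
    #    "metadata": {"title": "Attention Is All You Need", "abstract": "..."},
    #    "download_url": "https://arxiv.org/pdf/1706.03762.pdf", "error": None}
    # The endpoint can answer 200 even for IDs that do not resolve, in which case "verified"
    # is still set to True, so downstream code should not treat it as a strong guarantee.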

    async def download_paper(self, url: str, filename: str) -> Optional[str]:
        """Downloads a paper from a URL"""
        import requests
        import os

        try:
            response = requests.get(url, headers=self.headers, stream=True, timeout=30)

            if response.status_code == 200:
                # Create the downloads directory if it does not exist
                os.makedirs("downloads", exist_ok=True)

                # Determine the file extension
                content_type = response.headers.get('content-type', '')
                if 'application/pdf' in content_type:
                    ext = '.pdf'
                elif 'application/epub' in content_type:
                    ext = '.epub'
                else:
                    ext = '.pdf'  # Default

                filepath = os.path.join("downloads", f"{filename}{ext}")

                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

                return filepath

        except Exception as e:
            logger.error(f"Error downloading {url}: {e}")

        return None
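
# Hedged usage sketch (comments only): download_paper is declared async so it can be gathered
# by BibliographySystem, but its body uses blocking requests I/O, e.g.
#   path = await verifier.download_paper("https://arxiv.org/pdf/1706.03762.pdf", "attention")
#   # -> "downloads/attention.pdf" on success, None on any failure
# where `verifier` is a ReferenceVerifier instance; the names are illustrative only.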

# ========== MAIN SYSTEM ==========

class BibliographySystem:
    """Main bibliographic processing system"""

    def __init__(self):
        self.extractor = ReferenceExtractor()
        self.verifier = ReferenceVerifier()
        self.api_provider = APIProvider()

        # Working directories
        os.makedirs("downloads", exist_ok=True)
        os.makedirs("reports", exist_ok=True)

    async def process_document(self, text: str, use_ai: bool = False,
                               api_provider: str = "openai", api_key: str = "",
                               api_model: str = "") -> Dict[str, Any]:
        """Processes a document and extracts its references"""
        start_time = datetime.now()

        # 1. Extract references
        logger.info("Extracting references...")
        references = self.extractor.extract_from_text(text)

        total_refs = sum(len(v) for v in references.values())
        logger.info(f"Found {total_refs} references")

        # 2. Verify references
        logger.info("Verifying references...")
        verified_refs = []
        download_tasks = []

        # Process DOIs
        for doi in references.get("doi", []):
            result = await self.verifier.verify_doi(doi)
            if result["verified"]:
                verified_refs.append(result)
                if result["download_url"]:
                    # Schedule the download
                    filename = hashlib.md5(doi.encode()).hexdigest()[:8]
                    download_tasks.append(
                        self.verifier.download_paper(result["download_url"], filename)
                    )

        # Process arXiv IDs
        for arxiv_id in references.get("arxiv", []):
            result = await self.verifier.verify_arxiv(arxiv_id)
            if result["verified"]:
                verified_refs.append(result)
                if result["download_url"]:
                    filename = hashlib.md5(arxiv_id.encode()).hexdigest()[:8]
                    download_tasks.append(
                        self.verifier.download_paper(result["download_url"], filename)
                    )

        # 3. Run AI analysis if enabled
        ai_analysis = None
        if use_ai and api_key and api_provider:
            logger.info("Using AI for analysis...")
            ai_analysis = await self._analyze_with_ai(
                text, references, verified_refs,
                api_provider, api_key, api_model
            )

        # 4. Download files
        logger.info("Downloading files...")
        downloaded_files = []
        if download_tasks:
            download_results = await asyncio.gather(*download_tasks)
            downloaded_files = [r for r in download_results if r]

        # 5. Build the report
        processing_time = (datetime.now() - start_time).total_seconds()

        report = {
            "timestamp": datetime.now().isoformat(),
            "processing_time": processing_time,
            "total_references_found": total_refs,
            "references_by_type": references,
            "verified_references": len(verified_refs),
            "verification_details": verified_refs,
            "downloaded_files": downloaded_files,
            "ai_analysis": ai_analysis,
            "statistics": {
                "verification_rate": len(verified_refs) / max(1, total_refs),
                "download_rate": len(downloaded_files) / max(1, len(verified_refs))
            }
        }

        # 6. Save the report
        report_filename = f"report_{hashlib.md5(text.encode()).hexdigest()[:8]}.json"
        report_path = os.path.join("reports", report_filename)

        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)

        # 7. Create the ZIP archive
        zip_path = self._create_zip(report, downloaded_files)

        return {
            "success": True,
            "report": report,
            "zip_path": zip_path,
            "summary": {
                "found": total_refs,
                "verified": len(verified_refs),
                "downloaded": len(downloaded_files),
                "time": f"{processing_time:.2f}s"
            }
        }
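
    # Hedged sketch (comments only) of the dict the Gradio callback consumes:
    #   {"success": True, "report": {...}, "zip_path": "bibliography_results_<timestamp>.zip",
    #    "summary": {"found": 7, "verified": 5, "downloaded": 3, "time": "4.21s"}}
    # "found" counts every extracted identifier while "verified" only counts those that
    # resolved, so the two numbers normally differ; the figures above are illustrative.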

    async def _analyze_with_ai(self, text: str, references: Dict,
                               verified_refs: List, api_provider: str,
                               api_key: str, api_model: str) -> Optional[Dict]:
        """Analyzes the document with AI"""
        try:
            # Build the prompt
            prompt = f"""Analiza el siguiente documento académico y sus referencias:

Documento (primeros 2000 caracteres):
{text[:2000]}...

Referencias encontradas:
{json.dumps(references, indent=2, ensure_ascii=False)}

Referencias verificadas: {len(verified_refs)}

Proporciona un análisis que incluya:
1. Temas principales del documento
2. Calidad de las referencias (relevancia, actualidad)
3. Sugerencias de referencias faltantes
4. Evaluación general de la solidez bibliográfica

Responde en formato JSON con las siguientes claves:
- main_topics (lista de temas)
- reference_quality (score 1-10)
- missing_references (sugerencias)
- overall_assessment (texto)
- recommendations (lista)"""

            messages = [
                {"role": "system", "content": "Eres un experto en análisis bibliográfico académico."},
                {"role": "user", "content": prompt}
            ]

            # Call the API
            analysis_text = await self.api_provider.call_api(
                api_provider, api_key, api_model, messages, max_tokens=1500
            )

            if analysis_text:
                # Try to extract a JSON object from the response
                try:
                    json_match = re.search(r'\{.*\}', analysis_text, re.DOTALL)
                    if json_match:
                        return json.loads(json_match.group())
                    else:
                        return {"raw_analysis": analysis_text}
                except json.JSONDecodeError:
                    return {"raw_analysis": analysis_text}

        except Exception as e:
            logger.error(f"AI analysis error: {e}")

        return None
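
    # Hedged sketch (comments only): when the model follows the prompt, the parsed analysis
    # looks roughly like
    #   {"main_topics": [...], "reference_quality": 7, "missing_references": [...],
    #    "overall_assessment": "...", "recommendations": [...]}
    # If no JSON object can be located in the reply, the raw text is wrapped as
    # {"raw_analysis": "..."}; any API failure falls through to None so processing continues
    # without the analysis.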

    def _create_zip(self, report: Dict, downloaded_files: List[str]) -> str:
        """Creates a ZIP archive with the results"""
        import zipfile
        from datetime import datetime

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        zip_filename = f"bibliography_results_{timestamp}.zip"

        with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
            # Add the JSON report
            report_path = os.path.join("reports", f"report_{timestamp}.json")
            with open(report_path, 'w', encoding='utf-8') as f:
                json.dump(report, f, indent=2, ensure_ascii=False)
            zipf.write(report_path, "report.json")

            # Add the downloaded files
            for file_path in downloaded_files:
                if os.path.exists(file_path):
                    zipf.write(file_path, f"downloads/{os.path.basename(file_path)}")

            # Add a plain-text summary
            summary = self._generate_summary_text(report)
            zipf.writestr("summary.txt", summary)

        return zip_filename
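
    # Hedged sketch (comments only) of the archive layout produced above:
    #   report.json            - the full processing report
    #   downloads/<hash>.pdf   - whichever papers could be fetched
    #   summary.txt            - plain-text summary from _generate_summary_text
    # The ZIP itself is written to the current working directory rather than to reports/.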

    def _generate_summary_text(self, report: Dict) -> str:
        """Generates a plain-text summary"""
        return f"""
RESUMEN DE PROCESAMIENTO BIBLIOGRÁFICO
======================================

Fecha: {report.get('timestamp', 'N/A')}
Tiempo de procesamiento: {report.get('processing_time', 0):.2f} segundos

ESTADÍSTICAS:
------------
• Referencias encontradas: {report.get('total_references_found', 0)}
• Referencias verificadas: {report.get('verified_references', 0)}
• Archivos descargados: {len(report.get('downloaded_files', []))}
• Tasa de verificación: {report.get('statistics', {}).get('verification_rate', 0) * 100:.1f}%
• Tasa de descarga: {report.get('statistics', {}).get('download_rate', 0) * 100:.1f}%

REFERENCIAS POR TIPO:
---------------------
{json.dumps(report.get('references_by_type', {}), indent=2, ensure_ascii=False)}

Para más detalles, consulte el reporte JSON incluido.
"""

# ========== SIMPLIFIED GRADIO INTERFACE ==========

def create_simple_interface():
    """Creates a simple, working Gradio interface"""

    system = BibliographySystem()

    async def process_text(text_input, use_ai, api_provider, api_key, api_model):
        """Processes the submitted text"""
        if not text_input.strip():
            return None, "❌ Error: No se ingresó texto", "", "", ""

        try:
            result = await system.process_document(
                text_input, use_ai, api_provider, api_key, api_model
            )

            if result["success"]:
                summary = result["summary"]

                # Build the HTML view
                html_output = f"""
                <div style="font-family: Arial, sans-serif; padding: 20px;">
                    <h2 style="color: #2c3e50;">📊 Resultados del Procesamiento</h2>

                    <div style="background: #ecf0f1; padding: 15px; border-radius: 10px; margin: 15px 0;">
                        <h3 style="color: #34495e;">📈 Estadísticas</h3>
                        <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 10px;">
                            <div style="background: white; padding: 10px; border-radius: 5px;">
                                <strong>Referencias Encontradas</strong><br>
                                <span style="font-size: 24px; color: #3498db;">{summary['found']}</span>
                            </div>
                            <div style="background: white; padding: 10px; border-radius: 5px;">
                                <strong>Verificadas</strong><br>
                                <span style="font-size: 24px; color: #2ecc71;">{summary['verified']}</span>
                            </div>
                            <div style="background: white; padding: 10px; border-radius: 5px;">
                                <strong>Descargadas</strong><br>
                                <span style="font-size: 24px; color: #9b59b6;">{summary['downloaded']}</span>
                            </div>
                            <div style="background: white; padding: 10px; border-radius: 5px;">
                                <strong>Tiempo</strong><br>
                                <span style="font-size: 24px; color: #e74c3c;">{summary['time']}</span>
                            </div>
                        </div>
                    </div>
                </div>
                """

                # Build the plain-text summary
                text_output = f"""
Procesamiento completado exitosamente.

• Referencias encontradas: {summary['found']}
• Referencias verificadas: {summary['verified']}
• Archivos descargados: {summary['downloaded']}
• Tiempo de procesamiento: {summary['time']}

El archivo ZIP con los resultados está listo para descargar.
"""

                # Report JSON (truncated if large)
                report_json = json.dumps(result["report"], indent=2, ensure_ascii=False)
                if len(report_json) > 5000:
                    report_json = report_json[:5000] + "\n... (reporte truncado por tamaño)"

                return result["zip_path"], "✅ Procesamiento completado", html_output, text_output, report_json

            else:
                return None, f"❌ Error: {result.get('error', 'Error desconocido')}", "", "", ""

        except Exception as e:
            logger.error(f"Processing error: {e}")
            return None, f"❌ Error: {str(e)}", "", "", ""
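
    # Hedged note (comments only): the five values returned by process_text map positionally to
    # the outputs wired below, i.e. [result_file, result_status, html_output, text_output,
    # json_output], so the error paths return (None, message, "", "", "") to keep every
    # component populated.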

    # Build the interface
    with gr.Blocks(title="Sistema de Recopilación Bibliográfica", theme=gr.themes.Soft()) as interface:
        gr.Markdown("# 📚 Sistema de Recopilación Bibliográfica")
        gr.Markdown("Extrae, verifica y descarga referencias académicas de textos")

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### ⚙️ Configuración")

                use_ai = gr.Checkbox(
                    label="Usar IA para análisis avanzado",
                    value=False
                )

                api_provider = gr.Dropdown(
                    choices=["openai", "moonshot", "nebius", "anthropic", "deepseek"],
                    label="Proveedor de IA",
                    value="moonshot"
                )

                api_key = gr.Textbox(
                    label="API Key",
                    type="password",
                    placeholder="Ingresa tu API key"
                )

                api_model = gr.Textbox(
                    label="Modelo (opcional)",
                    value="moonshotai/Kimi-K2-Instruct",
                    placeholder="Deja vacío para usar el modelo por defecto"
                )

                gr.Markdown("""
                ### 🔑 APIs Soportadas
                - **Moonshot**: moonshotai/Kimi-K2-Instruct
                - **Nebius**: neural-chat-7b-v3-1
                - **OpenAI**: gpt-4, gpt-3.5-turbo
                - **Anthropic**: Claude 3
                - **DeepSeek**: deepseek-chat
                """)

            with gr.Column(scale=2):
                gr.Markdown("### 📄 Ingresar Texto")

                text_input = gr.Textbox(
                    label="Texto con referencias bibliográficas",
                    placeholder="Pega aquí tu texto con referencias académicas...",
                    lines=15,
                    max_lines=50
                )

                process_btn = gr.Button("🔍 Procesar Texto", variant="primary")

                gr.Markdown("### 📦 Resultados")

                result_file = gr.File(label="Descargar Resultados (ZIP)")
                result_status = gr.Markdown()

                with gr.Tabs():
                    with gr.TabItem("📋 Vista HTML"):
                        html_output = gr.HTML(label="Resultados Visuales")

                    with gr.TabItem("📝 Texto"):
                        text_output = gr.Textbox(
                            label="Resumen",
                            lines=10,
                            max_lines=20
                        )

                    with gr.TabItem("🔧 JSON"):
                        json_output = gr.Code(
                            label="Datos del Reporte",
                            language="json",
                            lines=15
                        )

        # Wire up events
        process_btn.click(
            process_text,
            inputs=[text_input, use_ai, api_provider, api_key, api_model],
            outputs=[result_file, result_status, html_output, text_output, json_output]
        )

        # Examples
        gr.Markdown("### 📖 Ejemplo de Texto")
        gr.Examples(
            examples=[["""Este es un ejemplo de texto con referencias académicas.

1. El paper seminal de AlexNet (Krizhevsky et al., 2012) tiene DOI: 10.1145/3065386

2. El trabajo sobre Transformers está en arXiv: arXiv:1706.03762

3. El libro de Deep Learning tiene ISBN: 978-0262035613

4. Más referencias:
- DOI: 10.1038/nature14539
- DOI: 10.1109/CVPR.2016.90
- arXiv: 1506.02640

URLs académicas:
- https://arxiv.org/abs/1706.03762
- https://doi.org/10.1145/3065386"""]],
            inputs=[text_input],
            label="Ejemplo básico"
        )

    return interface
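
# Hedged local-testing sketch (comments only): outside Spaces one can build and launch the UI
# directly, e.g.
#   demo = create_simple_interface()
#   demo.launch()   # default host/port; main() below applies the Spaces-friendly settings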

# ========== MAIN EXECUTION ==========

def main():
    """Main entry point"""
    # Create and launch the interface
    interface = create_simple_interface()

    # Configuration for Hugging Face Spaces
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,  # Sharing is disabled on Spaces
        debug=False
    )

if __name__ == "__main__":
    main()