Spaces:

AethronPhantom
/

Nexa_Data_Studio

Running

App Files Files Community

Nexa_Data_Studio / app.py

Allanatrix

Upload app.py with huggingface_hub

7917ec8 verified 11 days ago

raw

history blame contribute delete

14.4 kB

	"""
	Nexa Data Studio — Scientific Dataset Generator
	Aethron Labs | No payment required, fully functional synthetic data generation.
	"""

	import gradio as gr
	import json
	import csv
	import io
	import random
	import math
	import time
	import tempfile
	import os
	from datetime import datetime

	# ── Synthetic data generators ──────────────────────────────────────────────

	def _gaussian_noise(n, dim, noise=0.05):
	    """Build an ``n`` x ``dim`` nested list of noisy standard-normal values.

	    Every entry is one N(0, 1) draw plus one N(0, ``noise``) draw, rounded
	    to 4 decimal places. Draws come from the module-level ``random``
	    generator, so results depend on its current state.
	    """
	    table = []
	    for _ in range(n):
	        row = []
	        for _ in range(dim):
	            base = random.gauss(0, 1)
	            jitter = random.gauss(0, noise)
	            row.append(round(base + jitter, 4))
	        table.append(row)
	    return table

	def generate_regression(n_samples: int, n_features: int, noise_level: float, seed) -> list[dict]:
	    """Generate a synthetic linear-regression dataset.

	    A weight vector is drawn uniformly from [-2, 2]. Each sample gets
	    standard-normal features (rounded to 4 decimals) and a target equal to
	    the weighted sum of those features plus N(0, ``noise_level``) noise.

	    A private ``random.Random(seed)`` is used instead of reseeding the
	    module-level generator: it yields the same number sequence as
	    ``random.seed(seed)`` would, but leaves the shared RNG state untouched
	    so other users of ``random`` cannot perturb (or be perturbed by) a run.

	    Args:
	        n_samples: Number of records to produce.
	        n_features: Number of feature columns per record.
	        noise_level: Standard deviation of the additive target noise.
	        seed: Seed for the private generator.

	    Returns:
	        A list of dicts with keys ``x1`` .. ``x{n_features}``, ``y`` and
	        ``sample_id`` (in that order).
	    """
	    rng = random.Random(seed)
	    weights = [rng.uniform(-2, 2) for _ in range(n_features)]

	    records = []
	    for sample_id in range(n_samples):
	        features = [round(rng.gauss(0, 1), 4) for _ in range(n_features)]
	        # The target is computed from the rounded features, then noise is added.
	        target = sum(w * v for w, v in zip(weights, features)) + rng.gauss(0, noise_level)

	        row = {f"x{j}": value for j, value in enumerate(features, start=1)}
	        row["y"] = round(target, 4)
	        row["sample_id"] = sample_id
	        records.append(row)
	    return records

	def generate_classification(n_samples: int, n_classes: int, n_features: int,
	                            noise_level: float, seed) -> list[dict]:
	    """Generate synthetic Gaussian-cluster classification data.

	    One centre per class is drawn uniformly from [-4, 4] in every feature
	    dimension. Each sample picks a class uniformly at random and scatters
	    around that class centre with standard deviation ``1 + noise_level``.

	    A private ``random.Random(seed)`` is used instead of reseeding the
	    module-level generator: same number sequence for a given seed, but no
	    side effect on the shared RNG state.

	    Args:
	        n_samples: Number of records to produce.
	        n_classes: Number of distinct class labels (``0 .. n_classes - 1``).
	        n_features: Number of feature columns per record.
	        noise_level: Extra spread added to the unit cluster deviation.
	        seed: Seed for the private generator.

	    Returns:
	        A list of dicts with keys ``x1`` .. ``x{n_features}``, ``label`` and
	        ``sample_id`` (in that order).
	    """
	    rng = random.Random(seed)
	    centers = [[rng.uniform(-4, 4) for _ in range(n_features)] for _ in range(n_classes)]
	    spread = 1 + noise_level

	    records = []
	    for sample_id in range(n_samples):
	        label = rng.randint(0, n_classes - 1)
	        row = {
	            f"x{j}": round(center + rng.gauss(0, spread), 4)
	            for j, center in enumerate(centers[label], start=1)
	        }
	        row["label"] = label
	        row["sample_id"] = sample_id
	        records.append(row)
	    return records

	def generate_timeseries(n_samples: int, n_series: int, noise_level: float, seed) -> list[dict]:
	    """Generate noisy sinusoidal time series in long (one row per point) form.

	    Each series gets its own frequency in [0.05, 0.3], amplitude in
	    [0.5, 2.0] and phase in [0, 2*pi]; every timestep adds
	    N(0, ``noise_level``) noise to the sine value.

	    A private ``random.Random(seed)`` is used instead of reseeding the
	    module-level generator: same number sequence for a given seed, but no
	    side effect on the shared RNG state.

	    Args:
	        n_samples: Number of timesteps per series.
	        n_series: Number of independent series.
	        noise_level: Standard deviation of the additive noise.
	        seed: Seed for the private generator.

	    Returns:
	        A list of ``n_series * n_samples`` dicts with keys ``series_id``,
	        ``timestep`` and ``value`` (rounded to 4 decimals).
	    """
	    rng = random.Random(seed)
	    records = []
	    for series_id in range(n_series):
	        freq = rng.uniform(0.05, 0.3)
	        amp = rng.uniform(0.5, 2.0)
	        phase = rng.uniform(0, 2 * math.pi)
	        # Angular frequency is constant for the whole series.
	        omega = 2 * math.pi * freq
	        for t in range(n_samples):
	            value = amp * math.sin(omega * t + phase) + rng.gauss(0, noise_level)
	            records.append({"series_id": series_id, "timestep": t, "value": round(value, 4)})
	    return records

	def generate_molecular(n_samples: int, seed) -> list[dict]:
	    """Generate synthetic molecular-descriptor records.

	    All fields are drawn independently at random; the formula string is
	    2-4 distinct element symbols, each followed by a count from 1 to 6, and
	    is not tied to ``n_atoms`` or ``mol_weight``. ``lipinski_pass`` is 1
	    when ``mol_weight <= 500``, ``logP <= 5``, ``HBD <= 5`` and
	    ``HBA <= 10`` all hold, else 0.

	    A private ``random.Random(seed)`` is used instead of reseeding the
	    module-level generator: same number sequence for a given seed, but no
	    side effect on the shared RNG state.

	    Args:
	        n_samples: Number of records to produce.
	        seed: Seed for the private generator.

	    Returns:
	        A list of dicts with keys ``sample_id``, ``formula``, ``n_atoms``,
	        ``mol_weight``, ``logP``, ``TPSA``, ``HBD``, ``HBA`` and
	        ``lipinski_pass``.
	    """
	    rng = random.Random(seed)
	    elements = ["C", "H", "O", "N", "S", "P", "F", "Cl"]

	    records = []
	    for sample_id in range(n_samples):
	        # The draw order below fixes the output for a given seed; keep it.
	        n_atoms = rng.randint(5, 20)
	        chosen = rng.sample(elements, rng.randint(2, 4))
	        formula = "".join(f"{symbol}{rng.randint(1, 6)}" for symbol in chosen)
	        mol_weight = round(rng.uniform(50, 500), 2)
	        log_p = round(rng.gauss(2.0, 1.5), 3)
	        tpsa = round(rng.uniform(20, 150), 2)
	        hbd = rng.randint(0, 5)
	        hba = rng.randint(0, 10)

	        passes = mol_weight <= 500 and log_p <= 5 and hbd <= 5 and hba <= 10
	        records.append({
	            "sample_id": sample_id, "formula": formula, "n_atoms": n_atoms,
	            "mol_weight": mol_weight, "logP": log_p, "TPSA": tpsa,
	            "HBD": hbd, "HBA": hba,
	            "lipinski_pass": int(passes),
	        })
	    return records

	def generate_pde_field(n_samples: int, grid_size: int, noise_level: float, seed) -> list[dict]:
	    """Generate noisy 2D sinusoidal fields sampled on a regular grid.

	    For each sample, wavenumbers ``kx`` and ``ky`` are drawn from
	    [0.5, 3.0] and the field ``sin(kx*pi*x) * cos(ky*pi*y)`` plus
	    N(0, ``noise_level``) noise is evaluated at ``x = gx / grid_size``,
	    ``y = gy / grid_size`` for ``gx, gy`` in ``0 .. grid_size - 1`` (so the
	    coordinate 1.0 itself is never sampled).

	    A private ``random.Random(seed)`` is used instead of reseeding the
	    module-level generator: same number sequence for a given seed, but no
	    side effect on the shared RNG state.

	    Args:
	        n_samples: Number of independent fields.
	        grid_size: Number of grid points along each axis.
	        noise_level: Standard deviation of the additive noise.
	        seed: Seed for the private generator.

	    Returns:
	        A list of ``n_samples * grid_size**2`` dicts with keys
	        ``sample_id``, ``x``, ``y`` (rounded to 3 decimals) and ``u``
	        (rounded to 4 decimals).
	    """
	    rng = random.Random(seed)
	    records = []
	    for sample_id in range(n_samples):
	        kx = rng.uniform(0.5, 3.0)
	        ky = rng.uniform(0.5, 3.0)
	        for gx in range(grid_size):
	            x = gx / grid_size  # invariant across the inner loop
	            for gy in range(grid_size):
	                y = gy / grid_size
	                u = math.sin(kx * math.pi * x) * math.cos(ky * math.pi * y) + rng.gauss(0, noise_level)
	                records.append({"sample_id": sample_id, "x": round(x, 3), "y": round(y, 3), "u": round(u, 4)})
	    return records

	# ── File writers ────────────────────────────────────────────────────────────

	def records_to_jsonl(records):
	    """Serialise records as JSON Lines text.

	    Each record becomes one ``json.dumps`` line; lines are separated by
	    ``\\n`` with no trailing newline. An empty input gives ``""``.
	    """
	    encoded_lines = [json.dumps(record) for record in records]
	    return "\n".join(encoded_lines)

	def records_to_csv(records):
	    """Serialise a list of dict records as CSV text.

	    The header row and column order come from the keys of the first
	    record. An empty (or otherwise falsy) input gives ``""``.
	    """
	    if records:
	        sink = io.StringIO()
	        columns = list(records[0])
	        table = csv.DictWriter(sink, fieldnames=columns)
	        table.writeheader()
	        for row in records:
	            table.writerow(row)
	        return sink.getvalue()
	    return ""

	def save_to_tmp(content, ext):
	    """Write ``content`` to a new temporary file and return its path.

	    The file is created with ``delete=False`` so it outlives this call; the
	    caller owns cleanup.

	    The file is opened as UTF-8 with ``newline=""`` so the text is written
	    byte-for-byte: no dependence on the locale's default encoding, and no
	    newline translation (which would turn the ``\\r\\n`` row endings
	    produced by the ``csv`` module into ``\\r\\r\\n`` on Windows).

	    Args:
	        content: Text to write.
	        ext: File extension without the leading dot (e.g. ``"csv"``).

	    Returns:
	        Absolute path of the written file.
	    """
	    with tempfile.NamedTemporaryFile(
	        mode="w", suffix=f".{ext}", delete=False, encoding="utf-8", newline=""
	    ) as handle:
	        handle.write(content)
	        path = handle.name
	    return path

	# ── Main generation function ────────────────────────────────────────────────

	def run_generation(dataset_type, n_samples, n_features, n_classes, n_series,
	                   grid_size, noise_level, seed, output_format, progress=gr.Progress()):
	    """Generate a dataset, write it to a temp file and report the outcome.

	    Numeric arguments are coerced with ``int``/``float`` before being passed
	    to the selected generator. Only the arguments the chosen generator needs
	    are coerced.

	    Args:
	        dataset_type: One of "Regression", "Classification", "Time Series",
	            "Molecular Properties" or "PDE Field (2D)".
	        n_samples: Sample count forwarded to the generator.
	        n_features: Feature count (regression / classification only).
	        n_classes: Class count (classification only).
	        n_series: Series count (time series only).
	        grid_size: Grid points per axis (PDE field only).
	        noise_level: Noise standard deviation (not used for molecular data).
	        seed: Random seed forwarded to the generator.
	        output_format: "JSONL" for JSON Lines; any other value yields CSV.
	        progress: Progress callback; called with a fraction and ``desc``.

	    Returns:
	        ``(status, filepath, preview)``. On success ``filepath`` is the
	        temp-file path and ``preview`` holds the first five records as JSON
	        lines. For an unknown type or any exception, ``filepath`` is ``None``
	        and ``preview`` is ``""``; ``status`` carries the message.
	    """
	    # Imported locally so the file-level import block need not change.
	    from datetime import timezone

	    progress(0, desc="Initialising...")
	    time.sleep(0.2)
	    progress(0.2, desc="Generating samples...")

	    # Lazy builders: argument coercion runs only for the selected type, and
	    # inside the try block below so bad values surface as an "Error: ..." status.
	    builders = {
	        "Regression": lambda: generate_regression(
	            int(n_samples), int(n_features), float(noise_level), int(seed)),
	        "Classification": lambda: generate_classification(
	            int(n_samples), int(n_classes), int(n_features), float(noise_level), int(seed)),
	        "Time Series": lambda: generate_timeseries(
	            int(n_samples), int(n_series), float(noise_level), int(seed)),
	        "Molecular Properties": lambda: generate_molecular(
	            int(n_samples), int(seed)),
	        "PDE Field (2D)": lambda: generate_pde_field(
	            int(n_samples), int(grid_size), float(noise_level), int(seed)),
	    }

	    try:
	        build = builders.get(dataset_type)
	        if build is None:
	            return "Unknown dataset type.", None, ""
	        records = build()

	        progress(0.7, desc="Serialising output...")
	        time.sleep(0.1)

	        if output_format == "JSONL":
	            content = records_to_jsonl(records)
	            ext = "jsonl"
	        else:
	            content = records_to_csv(records)
	            ext = "csv"

	        progress(0.9, desc="Writing file...")
	        filepath = save_to_tmp(content, ext)

	        progress(1.0, desc="Done!")
	        preview = records_to_jsonl(records[:5])
	        # datetime.utcnow() is deprecated; an aware UTC "now" formats identically.
	        stamp = datetime.now(timezone.utc).strftime('%H:%M:%S UTC')
	        status = (
	            f"Generated {len(records):,} records · {dataset_type} · "
	            f"{output_format} · seed={seed} · {stamp}"
	        )
	        return status, filepath, preview

	    except Exception as e:
	        # UI boundary: report the failure in the status box instead of raising.
	        return f"Error: {e}", None, ""

	# ── Label uploaded data ─────────────────────────────────────────────────────

	def _read_uploaded_records(path):
	    """Load dict records from a JSONL or CSV file; return ``(records, is_jsonl)``.

	    The file is treated as JSONL only when its first line parses as a JSON
	    *object*. Otherwise it is read as CSV with a header row. This keeps a
	    CSV whose header happens to be valid JSON (e.g. a lone number) from
	    being misread.
	    """
	    # utf-8-sig strips a leading BOM if present; newline="" is what the csv
	    # module asks for so quoted fields with embedded newlines survive.
	    with open(path, "r", encoding="utf-8-sig", newline="") as handle:
	        first_line = handle.readline().strip()
	        try:
	            is_jsonl = isinstance(json.loads(first_line), dict)
	        except ValueError:
	            is_jsonl = False
	        handle.seek(0)

	        if not is_jsonl:
	            return list(csv.DictReader(handle)), False

	        records = []
	        for line_no, raw in enumerate(handle, start=1):
	            text = raw.strip()
	            if not text:
	                continue
	            obj = json.loads(text)
	            if not isinstance(obj, dict):
	                raise ValueError(f"line {line_no} is not a JSON object")
	            records.append(obj)
	        return records, True


	def label_uploaded(file, label_col_name, n_classes, seed, progress=gr.Progress()):
	    """Attach a random integer class label to every record of an uploaded file.

	    The input format (JSONL or CSV) is detected from the first line and the
	    output is written in the same format. Labels are drawn uniformly from
	    ``0 .. n_classes - 1`` with a private ``random.Random(seed)``, so the
	    module-level RNG state is not modified.

	    Args:
	        file: Uploaded file, or ``None`` when nothing was uploaded.
	        label_col_name: Key / column under which the label is stored; an
	            existing value under that key is overwritten.
	        n_classes: Number of classes (coerced with ``int``).
	        seed: Seed for the label generator.
	        progress: Progress callback; called with a fraction and ``desc``.

	    Returns:
	        ``(status, filepath, preview)``. On success ``filepath`` is the
	        temp-file path and ``preview`` holds the first five records as JSON
	        lines. With no file or on any exception, ``filepath`` is ``None``
	        and ``preview`` is ``""``; ``status`` carries the message.
	    """
	    if file is None:
	        return "No file uploaded.", None, ""

	    progress(0, desc="Reading file...")
	    try:
	        # NOTE(review): the upload may arrive as an object exposing ``.name``
	        # or as a plain path string depending on the Gradio version / File
	        # ``type`` setting - accept both; confirm against the deployed version.
	        path = getattr(file, "name", file)
	        records, is_jsonl = _read_uploaded_records(path)

	        progress(0.5, desc="Assigning labels...")
	        rng = random.Random(seed)
	        highest_label = int(n_classes) - 1
	        for record in records:
	            record[label_col_name] = rng.randint(0, highest_label)

	        progress(0.85, desc="Writing output...")
	        if is_jsonl:
	            content, ext = records_to_jsonl(records), "jsonl"
	        else:
	            content, ext = records_to_csv(records), "csv"
	        filepath = save_to_tmp(content, ext)

	        progress(1.0, desc="Done!")
	        preview = "\n".join(json.dumps(r) for r in records[:5])
	        status = f"Labelled {len(records):,} records with {n_classes} classes → column '{label_col_name}'"
	        return status, filepath, preview

	    except Exception as e:
	        # UI boundary: report the failure in the status box instead of raising.
	        return f"Error: {e}", None, ""

	# ── Gradio UI ───────────────────────────────────────────────────────────────

	# Custom stylesheet passed to gr.Blocks(css=CSS): dark page background, a centred
	# container capped at 960px, monospace headings, purple primary buttons, and a
	# hidden footer. Every rule uses !important.
	CSS = """
	body, .gradio-container { background: #070a12 !important; color: #e8eaf6 !important; }
	.gradio-container { max-width: 960px !important; margin: 0 auto !important; }
	h1, h2, h3 { font-family: 'Space Mono', monospace !important; }
	.gr-button-primary { background: #7c5cfc !important; border-color: #7c5cfc !important; }
	.gr-button-primary:hover { background: #9b7ffe !important; }
	footer { display: none !important; }
	"""

	# Three-tab interface: generate a dataset, label an uploaded one, and an About page.
	# NOTE(review): the nesting below was rebuilt from a listing whose indentation
	# was lost; confirm the row/column placement against the deployed layout.
	with gr.Blocks(title="Nexa Data Studio", css=CSS, theme=gr.themes.Base()) as demo:

	    gr.Markdown("""
	# ⬡ Nexa Data Studio
	Scientific Dataset Generator · Aethron Labs
	Generate synthetic datasets for ML research — regression, classification, time series, molecular, and PDE fields. No payment required.
	---
	""")

	    with gr.Tabs():

	        # ── TAB 1: Generate ──────────────────────────────────────────────
	        with gr.TabItem("Generate Dataset"):
	            with gr.Row():
	                # Left column: options shared by every dataset type.
	                with gr.Column(scale=1):
	                    dataset_type = gr.Dropdown(
	                        ["Regression", "Classification", "Time Series", "Molecular Properties", "PDE Field (2D)"],
	                        label="Dataset Type", value="Regression"
	                    )
	                    n_samples = gr.Slider(50, 5000, value=500, step=50, label="Number of Samples")
	                    output_format = gr.Radio(["JSONL", "CSV"], value="JSONL", label="Output Format")
	                    noise_level = gr.Slider(0.0, 2.0, value=0.1, step=0.05, label="Noise Level (σ)")
	                    seed = gr.Number(value=42, label="Random Seed", precision=0)

	                # Right column: type-specific options, toggled by update_opts.
	                with gr.Column(scale=1):
	                    with gr.Group() as reg_cls_opts:
	                        n_features = gr.Slider(1, 20, value=4, step=1, label="Number of Features")
	                    with gr.Group(visible=False) as cls_opts:
	                        n_classes = gr.Slider(2, 10, value=3, step=1, label="Number of Classes")
	                    with gr.Group(visible=False) as ts_opts:
	                        n_series = gr.Slider(1, 20, value=3, step=1, label="Number of Series")
	                    with gr.Group(visible=False) as pde_opts:
	                        grid_size = gr.Slider(4, 32, value=8, step=2, label="Grid Size (NxN)")

	            def update_opts(dtype):
	                """Return visibility updates for the four option groups.

	                Order matches the outputs wired below: features, classes,
	                series, grid. "Molecular Properties" shows none of them.
	                """
	                show_feat = dtype in ["Regression", "Classification"]
	                show_cls = dtype == "Classification"
	                show_ts = dtype == "Time Series"
	                show_pde = dtype == "PDE Field (2D)"
	                return (
	                    gr.update(visible=show_feat),
	                    gr.update(visible=show_cls),
	                    gr.update(visible=show_ts),
	                    gr.update(visible=show_pde),
	                )

	            dataset_type.change(update_opts, dataset_type, [reg_cls_opts, cls_opts, ts_opts, pde_opts])

	            gen_btn = gr.Button("Generate Dataset", variant="primary")
	            gen_status = gr.Textbox(label="Status", interactive=False)
	            gen_file = gr.File(label="Download Generated Dataset")
	            gen_preview = gr.Code(label="Preview (first 5 records)", language="json", lines=8)

	            # Input order must match run_generation's positional parameters.
	            gen_btn.click(
	                run_generation,
	                inputs=[dataset_type, n_samples, n_features, n_classes, n_series, grid_size, noise_level, seed, output_format],
	                outputs=[gen_status, gen_file, gen_preview]
	            )

	        # ── TAB 2: Label Uploaded Data ───────────────────────────────────
	        with gr.TabItem("Label Uploaded Data"):
	            gr.Markdown("Upload an existing `.jsonl` or `.csv` file and automatically assign random class labels to each record.")
	            with gr.Row():
	                with gr.Column():
	                    upload_file = gr.File(label="Upload Dataset (.jsonl or .csv)", file_types=[".jsonl", ".csv"])
	                    label_col = gr.Textbox(value="label", label="Label Column Name")
	                    label_classes = gr.Slider(2, 20, value=3, step=1, label="Number of Classes")
	                    label_seed = gr.Number(value=42, label="Random Seed", precision=0)
	                    label_btn = gr.Button("Assign Labels", variant="primary")

	            label_status = gr.Textbox(label="Status", interactive=False)
	            label_file = gr.File(label="Download Labelled Dataset")
	            label_preview = gr.Code(label="Preview (first 5 records)", language="json", lines=8)

	            # Input order must match label_uploaded's positional parameters.
	            label_btn.click(
	                label_uploaded,
	                inputs=[upload_file, label_col, label_classes, label_seed],
	                outputs=[label_status, label_file, label_preview]
	            )

	        # ── TAB 3: About ─────────────────────────────────────────────────
	        with gr.TabItem("About"):
	            # Table cells use plain "|" separators; a backslash-escaped pipe is
	            # an invalid Python escape and is not read as a column separator.
	            gr.Markdown("""
	## Nexa Data Studio

	Part of the Nexa Stack by [Aethron Labs](https://huggingface.co/AethronPhantom) — a Scientific Machine Learning Research Lab.

	### Supported Dataset Types

	| Type | Description | Use Case |
	|------|-------------|----------|
	| Regression | Continuous target from linear combination of features + noise | Surrogate model training |
	| Classification | Gaussian cluster data with configurable classes | Classifier benchmarking |
	| Time Series | Multi-series sinusoidal signals with noise | Forecasting, anomaly detection |
	| Molecular Properties | Synthetic molecular descriptors (MW, logP, TPSA, HBD/HBA) | Drug discovery ML |
	| PDE Field (2D) | 2D sinusoidal field solutions with noise | Physics-informed neural networks |

	### Output Formats
	- JSONL — one JSON object per line, ideal for streaming and LLM fine-tuning pipelines
	- CSV — tabular format for pandas, sklearn, and spreadsheet tools

	### Notes
	All data is synthetically generated — no real molecular structures or physical measurements are included.
	For research use only.
	""")

	# Start the Gradio app. No __main__ guard is visible here, so this runs
	# whenever the module is executed or imported.
	demo.launch()