"""
Nexa Data Studio — Scientific Dataset Generator
Aethron Labs | No payment required, fully functional synthetic data generation.

The module defines a set of seeded synthetic-data generators, JSONL/CSV
serialisers, two Gradio event handlers (``run_generation`` and
``label_uploaded``) and the Gradio Blocks UI that wires them together.
Importing or running the module builds the UI and launches it.
"""

import csv
import io
import json
import math
import os
import random
import tempfile
import time
from datetime import datetime, timezone

import gradio as gr


# ── Synthetic data generators ──────────────────────────────────────────────
#
# Every generator draws from its own ``random.Random(seed)`` instance rather
# than reseeding the process-global generator.  The produced sequence is the
# same as ``random.seed(seed)`` followed by module-level calls, but one
# request can no longer disturb the random stream of another.

def _gaussian_noise(n, dim, noise=0.05):
    """Return ``n`` rows of ``dim`` noisy standard-normal values.

    Each value is ``gauss(0, 1) + gauss(0, noise)`` rounded to 4 decimals.
    Draws come from the module-level ``random`` state, so the result is only
    reproducible if the caller seeds that state first.
    """
    return [
        [round(random.gauss(0, 1) + random.gauss(0, noise), 4) for _ in range(dim)]
        for _ in range(n)
    ]


def generate_regression(n_samples, n_features, noise_level, seed):
    """Generate records for a noisy linear-regression problem.

    Args:
        n_samples: Number of records to produce.
        n_features: Number of input features per record.
        noise_level: Standard deviation of the Gaussian noise added to ``y``.
        seed: Seed for the private random generator.

    Returns:
        A list of dicts with keys ``x1`` .. ``x<n_features>``, ``y`` and
        ``sample_id``.  ``y`` is the weighted sum of the (rounded) features
        plus noise, rounded to 4 decimals.
    """
    rng = random.Random(seed)
    # Hidden ground-truth weights are drawn once and shared by all samples.
    weights = [rng.uniform(-2, 2) for _ in range(n_features)]

    records = []
    for sample_id in range(n_samples):
        features = [round(rng.gauss(0, 1), 4) for _ in range(n_features)]
        target = sum(w * xi for w, xi in zip(weights, features)) + rng.gauss(0, noise_level)
        row = {f"x{j}": value for j, value in enumerate(features, start=1)}
        records.append(row | {"y": round(target, 4), "sample_id": sample_id})
    return records


def generate_classification(n_samples, n_classes, n_features, noise_level, seed):
    """Generate records drawn from Gaussian clusters, one cluster per class.

    Args:
        n_samples: Number of records to produce.
        n_classes: Number of clusters / distinct labels (``0 .. n_classes-1``).
        n_features: Number of input features per record.
        noise_level: Added to the base standard deviation of 1 around each
            cluster centre.
        seed: Seed for the private random generator.

    Returns:
        A list of dicts with keys ``x1`` .. ``x<n_features>``, ``label`` and
        ``sample_id``.
    """
    rng = random.Random(seed)
    centers = [
        [rng.uniform(-4, 4) for _ in range(n_features)]
        for _ in range(n_classes)
    ]

    records = []
    for sample_id in range(n_samples):
        cls = rng.randint(0, n_classes - 1)
        features = [
            round(centers[cls][j] + rng.gauss(0, 1 + noise_level), 4)
            for j in range(n_features)
        ]
        row = {f"x{j}": value for j, value in enumerate(features, start=1)}
        records.append(row | {"label": cls, "sample_id": sample_id})
    return records


def generate_timeseries(n_samples, n_series, noise_level, seed):
    """Generate noisy sinusoidal series in long (one row per point) format.

    Args:
        n_samples: Number of timesteps per series.
        n_series: Number of independent series.
        noise_level: Standard deviation of the Gaussian noise added per point.
        seed: Seed for the private random generator.

    Returns:
        A list of ``n_series * n_samples`` dicts with keys ``series_id``,
        ``timestep`` and ``value``.
    """
    rng = random.Random(seed)
    records = []
    for series_id in range(n_series):
        # Each series gets its own frequency, amplitude and phase.
        freq = rng.uniform(0.05, 0.3)
        amp = rng.uniform(0.5, 2.0)
        phase = rng.uniform(0, 2 * math.pi)
        for t in range(n_samples):
            value = amp * math.sin(2 * math.pi * freq * t + phase) + rng.gauss(0, noise_level)
            records.append({"series_id": series_id, "timestep": t, "value": round(value, 4)})
    return records


def generate_molecular(n_samples, seed):
    """Generate records of synthetic molecular descriptors.

    All fields are sampled independently of each other; the formula string,
    atom count and descriptors are not chemically consistent.

    Args:
        n_samples: Number of records to produce.
        seed: Seed for the private random generator.

    Returns:
        A list of dicts with keys ``sample_id``, ``formula``, ``n_atoms``,
        ``mol_weight``, ``logP``, ``TPSA``, ``HBD``, ``HBA`` and
        ``lipinski_pass`` (1 when all four threshold checks hold, else 0).
    """
    rng = random.Random(seed)
    elements = ["C", "H", "O", "N", "S", "P", "F", "Cl"]

    records = []
    for sample_id in range(n_samples):
        # NOTE: the order of the draws below is part of the seeded output;
        # reordering them changes every generated dataset.
        n_atoms = rng.randint(5, 20)
        n_elements = rng.randint(2, 4)
        chosen = rng.sample(elements, n_elements)
        formula = "".join(f"{e}{rng.randint(1, 6)}" for e in chosen)
        mw = round(rng.uniform(50, 500), 2)
        logp = round(rng.gauss(2.0, 1.5), 3)
        tpsa = round(rng.uniform(20, 150), 2)
        hbd = rng.randint(0, 5)
        hba = rng.randint(0, 10)
        records.append({
            "sample_id": sample_id,
            "formula": formula,
            "n_atoms": n_atoms,
            "mol_weight": mw,
            "logP": logp,
            "TPSA": tpsa,
            "HBD": hbd,
            "HBA": hba,
            "lipinski_pass": int(mw <= 500 and logp <= 5 and hbd <= 5 and hba <= 10),
        })
    return records


def generate_pde_field(n_samples, grid_size, noise_level, seed):
    """Generate noisy 2D fields ``sin(kx*pi*x) * cos(ky*pi*y)`` on a grid.

    Args:
        n_samples: Number of independent fields.
        grid_size: Points per axis; coordinates are ``index / grid_size``.
        noise_level: Standard deviation of the Gaussian noise added per point.
        seed: Seed for the private random generator.

    Returns:
        A list of ``n_samples * grid_size**2`` dicts with keys ``sample_id``,
        ``x``, ``y`` and ``u``.
    """
    rng = random.Random(seed)
    records = []
    for sample_id in range(n_samples):
        kx = rng.uniform(0.5, 3.0)
        ky = rng.uniform(0.5, 3.0)
        for gx in range(grid_size):
            for gy in range(grid_size):
                x = gx / grid_size
                y = gy / grid_size
                u = math.sin(kx * math.pi * x) * math.cos(ky * math.pi * y) + rng.gauss(0, noise_level)
                records.append({
                    "sample_id": sample_id,
                    "x": round(x, 3),
                    "y": round(y, 3),
                    "u": round(u, 4),
                })
    return records


# ── File writers ────────────────────────────────────────────────────────────

def records_to_jsonl(records):
    """Serialise ``records`` as JSON Lines (no trailing newline)."""
    return "\n".join(json.dumps(r) for r in records)


def records_to_csv(records):
    """Serialise a list of dicts as CSV text with a header row.

    The column set is taken from the first record.  An empty list yields an
    empty string.  ``csv.DictWriter`` raises ``ValueError`` if a later record
    has a key that is not in the first record.
    """
    if not records:
        return ""
    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=list(records[0]))
    writer.writeheader()
    writer.writerows(records)
    return buf.getvalue()


def save_to_tmp(content, ext):
    """Write ``content`` to a new temporary file and return its path.

    The file is created with ``delete=False``, so it is left on disk after
    this function returns.

    Args:
        content: Text to write.
        ext: File extension without the leading dot.

    Returns:
        The path of the written file.
    """
    # newline="" writes the content byte-for-byte: the CSV text already
    # carries "\r\n" row terminators, which default newline translation would
    # turn into "\r\r\n" on Windows.  UTF-8 is pinned so the output does not
    # depend on the host locale.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=f".{ext}", mode="w", encoding="utf-8", newline=""
    ) as tmp:
        tmp.write(content)
    return tmp.name


# ── Main generation function ────────────────────────────────────────────────

def run_generation(dataset_type, n_samples, n_features, n_classes, n_series,
                   grid_size, noise_level, seed, output_format, progress=gr.Progress()):
    """Gradio handler: generate a dataset, write it to disk and preview it.

    Args:
        dataset_type: One of "Regression", "Classification", "Time Series",
            "Molecular Properties" or "PDE Field (2D)".
        n_samples, n_features, n_classes, n_series, grid_size: Size controls;
            each is converted with ``int()`` and only the ones relevant to
            ``dataset_type`` are used.
        noise_level: Converted with ``float()`` and passed to the generator.
        seed: Converted with ``int()`` and passed to the generator.
        output_format: "JSONL" selects JSON Lines; anything else selects CSV.
        progress: Gradio progress tracker (the default instance is how the
            parameter is declared for Gradio; it is called to report status).

    Returns:
        ``(status, filepath, preview)``.  On failure ``status`` starts with
        "Error: ", ``filepath`` is ``None`` and ``preview`` is empty.
    """
    progress(0, desc="Initialising...")
    time.sleep(0.2)
    progress(0.2, desc="Generating samples...")
    try:
        if dataset_type == "Regression":
            records = generate_regression(
                int(n_samples), int(n_features), float(noise_level), int(seed))
        elif dataset_type == "Classification":
            records = generate_classification(
                int(n_samples), int(n_classes), int(n_features), float(noise_level), int(seed))
        elif dataset_type == "Time Series":
            records = generate_timeseries(
                int(n_samples), int(n_series), float(noise_level), int(seed))
        elif dataset_type == "Molecular Properties":
            records = generate_molecular(int(n_samples), int(seed))
        elif dataset_type == "PDE Field (2D)":
            records = generate_pde_field(
                int(n_samples), int(grid_size), float(noise_level), int(seed))
        else:
            return "Unknown dataset type.", None, ""

        progress(0.7, desc="Serialising output...")
        time.sleep(0.1)
        if output_format == "JSONL":
            content = records_to_jsonl(records)
            ext = "jsonl"
        else:
            content = records_to_csv(records)
            ext = "csv"

        progress(0.9, desc="Writing file...")
        filepath = save_to_tmp(content, ext)
        progress(1.0, desc="Done!")

        # The preview is always JSON, regardless of the chosen output format.
        preview = "\n".join(json.dumps(r) for r in records[:5])
        # datetime.utcnow() is deprecated since Python 3.12; an aware UTC
        # timestamp formats to the same string.
        timestamp = datetime.now(timezone.utc).strftime('%H:%M:%S UTC')
        status = (
            f"Generated {len(records):,} records · {dataset_type} · "
            f"{output_format} · seed={seed} · {timestamp}"
        )
        return status, filepath, preview
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return f"Error: {e}", None, ""


# ── Label uploaded data ─────────────────────────────────────────────────────

def _load_records(path):
    """Read a JSONL or CSV file into a list of dicts.

    The format is detected from the first line: if it parses as a JSON
    *object* the file is read as JSON Lines, otherwise as CSV with a header
    row.  Requiring an object keeps CSV headers such as ``123`` or ``"name"``
    (which are valid JSON scalars) from being mistaken for JSONL.

    Returns:
        ``(records, is_jsonl)``.

    Raises:
        ValueError: A JSONL line is invalid JSON or is not a JSON object.
    """
    records = []
    # utf-8-sig drops a leading BOM if present; newline="" is what the csv
    # module expects so that quoted fields containing newlines parse correctly.
    with open(path, "r", encoding="utf-8-sig", newline="") as f:
        first_line = f.readline().strip()
        try:
            is_jsonl = isinstance(json.loads(first_line), dict)
        except ValueError:
            is_jsonl = False
        f.seek(0)

        if is_jsonl:
            for line_no, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue
                obj = json.loads(line)
                if not isinstance(obj, dict):
                    raise ValueError(f"line {line_no} is not a JSON object")
                records.append(obj)
        else:
            records = list(csv.DictReader(f))
    return records, is_jsonl


def label_uploaded(file, label_col_name, n_classes, seed, progress=gr.Progress()):
    """Gradio handler: add a random integer class label to every record.

    Args:
        file: The uploaded file as delivered by the ``gr.File`` input; either
            an object with a ``.name`` path attribute or a plain path string.
        label_col_name: Name of the column/key that receives the label.  An
            existing column of that name is overwritten.
        n_classes: Labels are drawn uniformly from ``0 .. int(n_classes) - 1``.
        seed: Seed for the private random generator.
        progress: Gradio progress tracker.

    Returns:
        ``(status, filepath, preview)``.  The output file uses the same format
        (JSONL or CSV) as the input.  On failure ``status`` describes the
        problem, ``filepath`` is ``None`` and ``preview`` is empty.
    """
    if file is None:
        return "No file uploaded.", None, ""
    if not label_col_name or not label_col_name.strip():
        return "Label column name must not be empty.", None, ""

    progress(0, desc="Reading file...")
    try:
        path = getattr(file, "name", file)
        records, is_jsonl = _load_records(path)

        progress(0.5, desc="Assigning labels...")
        rng = random.Random(seed)
        top_label = int(n_classes) - 1
        for r in records:
            r[label_col_name] = rng.randint(0, top_label)

        progress(0.85, desc="Writing output...")
        if is_jsonl:
            content = records_to_jsonl(records)
            ext = "jsonl"
        else:
            content = records_to_csv(records)
            ext = "csv"
        filepath = save_to_tmp(content, ext)
        progress(1.0, desc="Done!")

        preview = "\n".join(json.dumps(r) for r in records[:5])
        status = f"Labelled {len(records):,} records with {n_classes} classes → column '{label_col_name}'"
        return status, filepath, preview
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return f"Error: {e}", None, ""


# ── Gradio UI ───────────────────────────────────────────────────────────────

CSS = """
body, .gradio-container { background: #070a12 !important; color: #e8eaf6 !important; }
.gradio-container { max-width: 960px !important; margin: 0 auto !important; }
h1, h2, h3 { font-family: 'Space Mono', monospace !important; }
.gr-button-primary { background: #7c5cfc !important; border-color: #7c5cfc !important; }
.gr-button-primary:hover { background: #9b7ffe !important; }
footer { display: none !important; }
"""

# Markdown blocks are kept flush-left at module level so their rendering does
# not depend on how indentation inside the layout code is handled.
HEADER_MD = """
# ⬡ Nexa Data Studio
**Scientific Dataset Generator** · Aethron Labs

Generate synthetic datasets for ML research — regression, classification, time series, molecular, and PDE fields. No payment required.

---
"""

ABOUT_MD = """
## Nexa Data Studio

Part of the **Nexa Stack** by [Aethron Labs](https://huggingface.co/AethronPhantom) — a Scientific Machine Learning Research Lab.

### Supported Dataset Types

| Type | Description | Use Case |
|------|-------------|----------|
| **Regression** | Continuous target from linear combination of features + noise | Surrogate model training |
| **Classification** | Gaussian cluster data with configurable classes | Classifier benchmarking |
| **Time Series** | Multi-series sinusoidal signals with noise | Forecasting, anomaly detection |
| **Molecular Properties** | Synthetic molecular descriptors (MW, logP, TPSA, HBD/HBA) | Drug discovery ML |
| **PDE Field (2D)** | 2D sinusoidal field solutions with noise | Physics-informed neural networks |

### Output Formats

- **JSONL** — one JSON object per line, ideal for streaming and LLM fine-tuning pipelines
- **CSV** — tabular format for pandas, sklearn, and spreadsheet tools

### Notes

All data is synthetically generated — no real molecular structures or physical measurements are included.
For research use only.
"""


def update_opts(dtype):
    """Return visibility updates for the four type-specific option groups.

    The tuple order matches the outputs wired to ``dataset_type.change``:
    feature count, class count, series count, grid size.
    """
    show_feat = dtype in ["Regression", "Classification"]
    show_cls = dtype == "Classification"
    show_ts = dtype == "Time Series"
    show_pde = dtype == "PDE Field (2D)"
    return (
        gr.update(visible=show_feat),
        gr.update(visible=show_cls),
        gr.update(visible=show_ts),
        gr.update(visible=show_pde),
    )


with gr.Blocks(title="Nexa Data Studio", css=CSS, theme=gr.themes.Base()) as demo:
    gr.Markdown(HEADER_MD)

    with gr.Tabs():

        # ── TAB 1: Generate ──────────────────────────────────────────────
        with gr.TabItem("Generate Dataset"):
            with gr.Row():
                with gr.Column(scale=1):
                    dataset_type = gr.Dropdown(
                        ["Regression", "Classification", "Time Series",
                         "Molecular Properties", "PDE Field (2D)"],
                        label="Dataset Type",
                        value="Regression"
                    )
                    n_samples = gr.Slider(50, 5000, value=500, step=50, label="Number of Samples")
                    output_format = gr.Radio(["JSONL", "CSV"], value="JSONL", label="Output Format")
                    noise_level = gr.Slider(0.0, 2.0, value=0.1, step=0.05, label="Noise Level (σ)")
                    seed = gr.Number(value=42, label="Random Seed", precision=0)

                with gr.Column(scale=1):
                    # Only the groups relevant to the selected dataset type
                    # are shown; update_opts toggles them.
                    with gr.Group() as reg_cls_opts:
                        n_features = gr.Slider(1, 20, value=4, step=1, label="Number of Features")
                    with gr.Group(visible=False) as cls_opts:
                        n_classes = gr.Slider(2, 10, value=3, step=1, label="Number of Classes")
                    with gr.Group(visible=False) as ts_opts:
                        n_series = gr.Slider(1, 20, value=3, step=1, label="Number of Series")
                    with gr.Group(visible=False) as pde_opts:
                        grid_size = gr.Slider(4, 32, value=8, step=2, label="Grid Size (NxN)")

            dataset_type.change(update_opts, dataset_type, [reg_cls_opts, cls_opts, ts_opts, pde_opts])

            gen_btn = gr.Button("Generate Dataset", variant="primary")
            gen_status = gr.Textbox(label="Status", interactive=False)
            gen_file = gr.File(label="Download Generated Dataset")
            gen_preview = gr.Code(label="Preview (first 5 records)", language="json", lines=8)

            gen_btn.click(
                run_generation,
                inputs=[dataset_type, n_samples, n_features, n_classes, n_series,
                        grid_size, noise_level, seed, output_format],
                outputs=[gen_status, gen_file, gen_preview]
            )

        # ── TAB 2: Label Uploaded Data ───────────────────────────────────
        with gr.TabItem("Label Uploaded Data"):
            gr.Markdown("Upload an existing `.jsonl` or `.csv` file and automatically assign random class labels to each record.")
            with gr.Row():
                with gr.Column():
                    upload_file = gr.File(label="Upload Dataset (.jsonl or .csv)", file_types=[".jsonl", ".csv"])
                    label_col = gr.Textbox(value="label", label="Label Column Name")
                    label_classes = gr.Slider(2, 20, value=3, step=1, label="Number of Classes")
                    label_seed = gr.Number(value=42, label="Random Seed", precision=0)

            label_btn = gr.Button("Assign Labels", variant="primary")
            label_status = gr.Textbox(label="Status", interactive=False)
            label_file = gr.File(label="Download Labelled Dataset")
            label_preview = gr.Code(label="Preview (first 5 records)", language="json", lines=8)

            label_btn.click(
                label_uploaded,
                inputs=[upload_file, label_col, label_classes, label_seed],
                outputs=[label_status, label_file, label_preview]
            )

        # ── TAB 3: About ─────────────────────────────────────────────────
        with gr.TabItem("About"):
            gr.Markdown(ABOUT_MD)

# Launched unconditionally at module level, as in the original script.
demo.launch()