Spaces:
Running
Running
"""
Nexa Data Studio — Scientific Dataset Generator
Aethron Labs | No payment required, fully functional synthetic data generation.
"""
| import gradio as gr | |
| import json | |
| import csv | |
| import io | |
| import random | |
| import math | |
| import time | |
| import tempfile | |
| import os | |
| from datetime import datetime | |
# ── Synthetic data generators ──────────────────────────────────────────────
def _gaussian_noise(n, dim, noise=0.05):
    """Return ``n`` rows of ``dim`` noisy standard-normal values.

    Each value is a N(0, 1) draw plus a second N(0, ``noise``) draw, rounded
    to four decimal places. Draws come from the module-level ``random``
    generator, so results depend on its current state.
    """
    rows = []
    for _ in range(n):
        row = []
        for _ in range(dim):
            # Draw order matters for reproducibility: base value first, then jitter.
            base = random.gauss(0, 1)
            jitter = random.gauss(0, noise)
            row.append(round(base + jitter, 4))
        rows.append(row)
    return rows
def generate_regression(n_samples, n_features, noise_level, seed):
    """Generate a synthetic linear-regression dataset.

    A weight vector is drawn uniformly from [-2, 2]; each sample has
    standard-normal features (rounded to 4 decimals) and a target equal to
    the weighted sum of the rounded features plus N(0, ``noise_level``) noise.

    Args:
        n_samples: Number of records to produce.
        n_features: Number of feature columns (``x1`` .. ``xN``).
        noise_level: Standard deviation of the additive target noise.
        seed: Seed for the random generator; equal seeds give equal output.

    Returns:
        A list of dicts with keys ``x1..xN``, ``y`` and ``sample_id``.
    """
    # A private generator yields the same sequence as random.seed(seed) would,
    # but does not reseed the process-wide RNG shared by concurrent requests.
    rng = random.Random(seed)
    weights = [rng.uniform(-2, 2) for _ in range(n_features)]
    records = []
    for i in range(n_samples):
        x = [round(rng.gauss(0, 1), 4) for _ in range(n_features)]
        y = sum(w * xi for w, xi in zip(weights, x)) + rng.gauss(0, noise_level)
        record = {f"x{j}": xj for j, xj in enumerate(x, start=1)}
        record["y"] = round(y, 4)
        record["sample_id"] = i
        records.append(record)
    return records
def generate_classification(n_samples, n_classes, n_features, noise_level, seed):
    """Generate Gaussian-cluster classification data.

    One centre per class is drawn uniformly from [-4, 4] in every feature
    dimension. Each sample picks a class uniformly at random and scatters
    around that class centre with standard deviation ``1 + noise_level``.

    Args:
        n_samples: Number of records to produce.
        n_classes: Number of distinct labels (``0 .. n_classes - 1``).
        n_features: Number of feature columns (``x1`` .. ``xN``).
        noise_level: Extra spread added on top of the unit standard deviation.
        seed: Seed for the random generator; equal seeds give equal output.

    Returns:
        A list of dicts with keys ``x1..xN``, ``label`` and ``sample_id``.

    Raises:
        ValueError: If ``n_classes`` is below 1 while samples are requested
            (raised by ``randint`` on the empty label range).
    """
    # A private generator yields the same sequence as random.seed(seed) would,
    # but does not reseed the process-wide RNG shared by concurrent requests.
    rng = random.Random(seed)
    centers = [[rng.uniform(-4, 4) for _ in range(n_features)] for _ in range(n_classes)]
    spread = 1 + noise_level
    records = []
    for i in range(n_samples):
        cls = rng.randint(0, n_classes - 1)
        x = [round(c + rng.gauss(0, spread), 4) for c in centers[cls]]
        record = {f"x{j}": xj for j, xj in enumerate(x, start=1)}
        record["label"] = cls
        record["sample_id"] = i
        records.append(record)
    return records
def generate_timeseries(n_samples, n_series, noise_level, seed):
    """Generate noisy sinusoidal time series in long format.

    Every series gets its own random frequency (0.05-0.3), amplitude
    (0.5-2.0) and phase (0-2*pi); each timestep adds N(0, ``noise_level``)
    noise to the sine value.

    Args:
        n_samples: Number of timesteps per series.
        n_series: Number of independent series.
        noise_level: Standard deviation of the additive noise.
        seed: Seed for the random generator; equal seeds give equal output.

    Returns:
        A list of ``n_series * n_samples`` dicts with keys ``series_id``,
        ``timestep`` and ``value``, ordered by series and then timestep.
    """
    # A private generator yields the same sequence as random.seed(seed) would,
    # but does not reseed the process-wide RNG shared by concurrent requests.
    rng = random.Random(seed)
    two_pi = 2 * math.pi
    records = []
    for s in range(n_series):
        freq = rng.uniform(0.05, 0.3)
        amp = rng.uniform(0.5, 2.0)
        phase = rng.uniform(0, two_pi)
        for t in range(n_samples):
            val = amp * math.sin(two_pi * freq * t + phase) + rng.gauss(0, noise_level)
            records.append({"series_id": s, "timestep": t, "value": round(val, 4)})
    return records
def generate_molecular(n_samples, seed):
    """Generate synthetic molecular-descriptor records.

    All descriptors are drawn independently at random; the formula string is
    built from 2-4 randomly chosen elements with counts 1-6 and is not tied
    to ``n_atoms`` or ``mol_weight``. No real chemistry is modelled.

    Args:
        n_samples: Number of records to produce.
        seed: Seed for the random generator; equal seeds give equal output.

    Returns:
        A list of dicts with keys ``sample_id``, ``formula``, ``n_atoms``,
        ``mol_weight``, ``logP``, ``TPSA``, ``HBD``, ``HBA`` and
        ``lipinski_pass`` (1 when MW <= 500, logP <= 5, HBD <= 5, HBA <= 10).
    """
    # A private generator yields the same sequence as random.seed(seed) would,
    # but does not reseed the process-wide RNG shared by concurrent requests.
    rng = random.Random(seed)
    elements = ["C", "H", "O", "N", "S", "P", "F", "Cl"]
    records = []
    for i in range(n_samples):
        # Draw order below is kept fixed so existing seeds reproduce the same data.
        n_atoms = rng.randint(5, 20)
        chosen = rng.sample(elements, rng.randint(2, 4))
        formula = "".join(f"{e}{rng.randint(1, 6)}" for e in chosen)
        mw = round(rng.uniform(50, 500), 2)
        logp = round(rng.gauss(2.0, 1.5), 3)
        tpsa = round(rng.uniform(20, 150), 2)
        hbd = rng.randint(0, 5)
        hba = rng.randint(0, 10)
        records.append({
            "sample_id": i, "formula": formula, "n_atoms": n_atoms,
            "mol_weight": mw, "logP": logp, "TPSA": tpsa,
            "HBD": hbd, "HBA": hba,
            "lipinski_pass": int(mw <= 500 and logp <= 5 and hbd <= 5 and hba <= 10),
        })
    return records
def generate_pde_field(n_samples, grid_size, noise_level, seed):
    """Generate noisy 2D sinusoidal fields sampled on a regular grid.

    For each sample, wavenumbers ``kx`` and ``ky`` are drawn from [0.5, 3.0]
    and the field ``u = sin(kx*pi*x) * cos(ky*pi*y)`` plus N(0, ``noise_level``)
    noise is evaluated at ``x, y = index / grid_size`` (so the grid covers
    [0, 1) and never reaches 1.0).

    Args:
        n_samples: Number of independent fields.
        grid_size: Grid points per axis; each field yields ``grid_size**2`` rows.
        noise_level: Standard deviation of the additive noise.
        seed: Seed for the random generator; equal seeds give equal output.

    Returns:
        A list of dicts with keys ``sample_id``, ``x``, ``y`` and ``u``.
    """
    # A private generator yields the same sequence as random.seed(seed) would,
    # but does not reseed the process-wide RNG shared by concurrent requests.
    rng = random.Random(seed)
    records = []
    for i in range(n_samples):
        kx = rng.uniform(0.5, 3.0)
        ky = rng.uniform(0.5, 3.0)
        for gx in range(grid_size):
            x = gx / grid_size
            for gy in range(grid_size):
                y = gy / grid_size
                u = math.sin(kx * math.pi * x) * math.cos(ky * math.pi * y) + rng.gauss(0, noise_level)
                records.append({"sample_id": i, "x": round(x, 3), "y": round(y, 3), "u": round(u, 4)})
    return records
# ── File writers ────────────────────────────────────────────────────────────
def records_to_jsonl(records):
    """Serialise ``records`` as JSON Lines text.

    Each record becomes one ``json.dumps`` line; lines are separated by a
    single newline and there is no trailing newline. An empty input yields
    an empty string.
    """
    lines = [json.dumps(record) for record in records]
    return "\n".join(lines)
def records_to_csv(records):
    """Serialise a list of dict records as CSV text with a header row.

    The header is the union of all keys across ``records`` in first-seen
    order, so a record carrying a key that the first record lacks no longer
    makes ``csv.DictWriter`` raise ``ValueError``. Cells missing from a
    record are written as empty strings. For uniformly keyed records the
    output is unchanged.

    Args:
        records: List of dicts to write.

    Returns:
        CSV text using the csv module's default ``\\r\\n`` line endings, or
        an empty string when ``records`` is empty.
    """
    if not records:
        return ""
    # dict.fromkeys de-duplicates while preserving first-seen column order.
    fieldnames = list(dict.fromkeys(key for record in records for key in record))
    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=fieldnames, restval="")
    writer.writeheader()
    writer.writerows(records)
    return buf.getvalue()
def save_to_tmp(content, ext):
    """Write ``content`` to a new temporary file and return its path.

    The file is created with ``delete=False`` so it survives after closing
    and can be handed to a download widget; the caller owns its cleanup.

    Args:
        content: Text to write.
        ext: File extension without the leading dot (e.g. ``"csv"``).

    Returns:
        The filesystem path of the written file.
    """
    # - encoding="utf-8": do not depend on the platform default encoding.
    # - newline="": write line endings verbatim, so CSV text that already
    #   uses "\r\n" is not turned into "\r\r\n" on Windows.
    # - the context manager closes the handle even if write() raises.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=f".{ext}", mode="w", encoding="utf-8", newline=""
    ) as tmp:
        tmp.write(content)
        return tmp.name
# ── Main generation function ────────────────────────────────────────────────
def run_generation(dataset_type, n_samples, n_features, n_classes, n_series,
                   grid_size, noise_level, seed, output_format, progress=gr.Progress()):
    """Generate a dataset, write it to a temp file and report the outcome.

    Args:
        dataset_type: One of "Regression", "Classification", "Time Series",
            "Molecular Properties" or "PDE Field (2D)".
        n_samples, n_features, n_classes, n_series, grid_size: Size controls;
            only those relevant to ``dataset_type`` are converted and used.
        noise_level: Noise standard deviation passed to the generator
            (unused for "Molecular Properties").
        seed: Random seed forwarded to the generator.
        output_format: "JSONL" selects JSON Lines; any other value yields CSV.
        progress: Progress tracker. The ``gr.Progress()`` default is kept
            as-is because it is how the UI framework hooks in its progress
            bar; it is not an accidental shared default.

    Returns:
        A ``(status, filepath, preview)`` tuple. On failure or for an unknown
        dataset type, ``filepath`` is ``None`` and ``preview`` is ``""``.
    """
    # Local import: the file only imports the `datetime` class at the top.
    from datetime import timezone

    progress(0, desc="Initialising...")
    time.sleep(0.2)
    progress(0.2, desc="Generating samples...")
    try:
        # Arguments are converted per branch so that controls irrelevant to
        # the chosen dataset type can never cause a conversion error.
        if dataset_type == "Regression":
            records = generate_regression(int(n_samples), int(n_features), float(noise_level), int(seed))
        elif dataset_type == "Classification":
            records = generate_classification(int(n_samples), int(n_classes), int(n_features), float(noise_level), int(seed))
        elif dataset_type == "Time Series":
            records = generate_timeseries(int(n_samples), int(n_series), float(noise_level), int(seed))
        elif dataset_type == "Molecular Properties":
            records = generate_molecular(int(n_samples), int(seed))
        elif dataset_type == "PDE Field (2D)":
            records = generate_pde_field(int(n_samples), int(grid_size), float(noise_level), int(seed))
        else:
            return "Unknown dataset type.", None, ""

        progress(0.7, desc="Serialising output...")
        time.sleep(0.1)
        if output_format == "JSONL":
            content = records_to_jsonl(records)
            ext = "jsonl"
        else:
            content = records_to_csv(records)
            ext = "csv"

        progress(0.9, desc="Writing file...")
        filepath = save_to_tmp(content, ext)
        progress(1.0, desc="Done!")

        preview = "\n".join(json.dumps(r) for r in records[:5])
        # Timezone-aware replacement for the deprecated datetime.utcnow().
        stamp = datetime.now(timezone.utc).strftime('%H:%M:%S UTC')
        # The separator is a middle dot; the previous literal was mojibake.
        status = (
            f"Generated {len(records):,} records · {dataset_type} · "
            f"{output_format} · seed={seed} · {stamp}"
        )
        return status, filepath, preview
    except Exception as e:
        # UI boundary: surface the failure in the status box instead of crashing.
        return f"Error: {e}", None, ""
# ── Label uploaded data ─────────────────────────────────────────────────────
def _load_uploaded_records(path):
    """Read ``path`` as JSONL or CSV and return ``(records, is_jsonl)``.

    The file counts as JSONL only when its first non-blank line parses as a
    JSON *object*; anything else (including a bare number or quoted string,
    which are valid JSON but also plausible CSV headers) is read as CSV.

    Raises:
        ValueError: If a JSONL line is malformed or is not a JSON object.
    """
    # utf-8-sig also accepts plain UTF-8 and drops a leading BOM, which would
    # otherwise defeat the JSON sniffing and pollute the first CSV header.
    # newline="" leaves line endings untouched, as the csv module expects.
    with open(path, "r", encoding="utf-8-sig", newline="") as f:
        text = f.read()

    stripped_lines = (raw.strip() for raw in io.StringIO(text, newline=""))
    first_line = next((line for line in stripped_lines if line), "")
    try:
        is_jsonl = isinstance(json.loads(first_line), dict)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        is_jsonl = False

    if not is_jsonl:
        return list(csv.DictReader(io.StringIO(text, newline=""))), False

    records = []
    for lineno, raw in enumerate(io.StringIO(text, newline=""), start=1):
        line = raw.strip()
        if not line:
            continue
        obj = json.loads(line)
        if not isinstance(obj, dict):
            raise ValueError(f"line {lineno}: expected a JSON object, got {type(obj).__name__}")
        records.append(obj)
    return records, True


def label_uploaded(file, label_col_name, n_classes, seed, progress=gr.Progress()):
    """Add a random integer class label to every record of an uploaded file.

    Args:
        file: Uploaded file. Either a filesystem path (``str``/``PathLike``)
            or an object exposing the path as ``.name``; which one arrives
            depends on the UI framework version — NOTE(review): confirm
            against the pinned Gradio version.
        label_col_name: Key/column that receives the label (an existing
            column of that name is overwritten).
        n_classes: Labels are drawn uniformly from ``0 .. n_classes - 1``.
        seed: Seed for the label generator; equal seeds give equal labels.
        progress: Progress tracker. The ``gr.Progress()`` default is kept
            as-is because it is how the UI framework hooks in its progress bar.

    Returns:
        A ``(status, filepath, preview)`` tuple; the output file uses the
        same format (JSONL or CSV) as the input. On failure ``filepath`` is
        ``None`` and ``preview`` is ``""``.
    """
    if file is None:
        return "No file uploaded.", None, ""
    progress(0, desc="Reading file...")
    try:
        path = os.fspath(file) if isinstance(file, (str, os.PathLike)) else file.name
        records, is_jsonl = _load_uploaded_records(path)

        progress(0.5, desc="Assigning labels...")
        # Private generator: same labels as random.seed(seed) would give, but
        # without reseeding the process-wide RNG used by other requests.
        rng = random.Random(seed)
        max_label = int(n_classes) - 1
        for record in records:
            record[label_col_name] = rng.randint(0, max_label)

        progress(0.85, desc="Writing output...")
        content = records_to_jsonl(records) if is_jsonl else records_to_csv(records)
        ext = "jsonl" if is_jsonl else "csv"
        filepath = save_to_tmp(content, ext)
        progress(1.0, desc="Done!")

        preview = "\n".join(json.dumps(r) for r in records[:5])
        status = f"Labelled {len(records):,} records with {n_classes} classes β column '{label_col_name}'"
        return status, filepath, preview
    except Exception as e:
        # UI boundary: surface the failure in the status box instead of crashing.
        return f"Error: {e}", None, ""
# ── Gradio UI ───────────────────────────────────────────────────────────────
# Custom stylesheet passed to gr.Blocks(css=...): dark page background, a
# 960px centred container, monospace headings, purple primary buttons, and a
# hidden page footer.
CSS = """
body, .gradio-container { background: #070a12 !important; color: #e8eaf6 !important; }
.gradio-container { max-width: 960px !important; margin: 0 auto !important; }
h1, h2, h3 { font-family: 'Space Mono', monospace !important; }
.gr-button-primary { background: #7c5cfc !important; border-color: #7c5cfc !important; }
.gr-button-primary:hover { background: #9b7ffe !important; }
footer { display: none !important; }
"""
# Declarative UI: three tabs (generate, label an upload, about).
# NOTE(review): the original indentation was lost, so the nesting below is a
# reconstruction from the statement order — confirm the layout visually.
# Markdown bodies are kept at column 0 so they render the same whether or not
# the framework dedents them. User-visible text had mis-decoded characters
# (sigma, middle dot, dashes), which are restored here.
with gr.Blocks(title="Nexa Data Studio", css=CSS, theme=gr.themes.Base()) as demo:
    # A blank line before `---` is required; without it Markdown treats the
    # preceding paragraph as a setext heading instead of drawing a rule.
    gr.Markdown("""
# ⬑ Nexa Data Studio

**Scientific Dataset Generator** · Aethron Labs

Generate synthetic datasets for ML research — regression, classification, time series, molecular, and PDE fields. No payment required.

---
""")
    with gr.Tabs():
        # ── TAB 1: Generate ──────────────────────────────────────────────
        with gr.TabItem("Generate Dataset"):
            with gr.Row():
                # Left column: controls common to every dataset type.
                with gr.Column(scale=1):
                    dataset_type = gr.Dropdown(
                        ["Regression", "Classification", "Time Series", "Molecular Properties", "PDE Field (2D)"],
                        label="Dataset Type", value="Regression"
                    )
                    n_samples = gr.Slider(50, 5000, value=500, step=50, label="Number of Samples")
                    output_format = gr.Radio(["JSONL", "CSV"], value="JSONL", label="Output Format")
                    noise_level = gr.Slider(0.0, 2.0, value=0.1, step=0.05, label="Noise Level (σ)")
                    seed = gr.Number(value=42, label="Random Seed", precision=0)
                # Right column: type-specific controls, one group per type;
                # only the group matching the default type starts visible.
                with gr.Column(scale=1):
                    with gr.Group() as reg_cls_opts:
                        n_features = gr.Slider(1, 20, value=4, step=1, label="Number of Features")
                    with gr.Group(visible=False) as cls_opts:
                        n_classes = gr.Slider(2, 10, value=3, step=1, label="Number of Classes")
                    with gr.Group(visible=False) as ts_opts:
                        n_series = gr.Slider(1, 20, value=3, step=1, label="Number of Series")
                    with gr.Group(visible=False) as pde_opts:
                        grid_size = gr.Slider(4, 32, value=8, step=2, label="Grid Size (NxN)")

            def update_opts(dtype):
                """Return visibility updates for the four option groups.

                The tuple order matches the outputs wired below:
                features, classes, series, PDE grid.
                """
                show_feat = dtype in ["Regression", "Classification"]
                show_cls = dtype == "Classification"
                show_ts = dtype == "Time Series"
                show_pde = dtype == "PDE Field (2D)"
                return (
                    gr.update(visible=show_feat),
                    gr.update(visible=show_cls),
                    gr.update(visible=show_ts),
                    gr.update(visible=show_pde),
                )

            dataset_type.change(update_opts, dataset_type, [reg_cls_opts, cls_opts, ts_opts, pde_opts])
            gen_btn = gr.Button("Generate Dataset", variant="primary")
            gen_status = gr.Textbox(label="Status", interactive=False)
            gen_file = gr.File(label="Download Generated Dataset")
            gen_preview = gr.Code(label="Preview (first 5 records)", language="json", lines=8)
            # Input order must match run_generation's positional parameters.
            gen_btn.click(
                run_generation,
                inputs=[dataset_type, n_samples, n_features, n_classes, n_series, grid_size, noise_level, seed, output_format],
                outputs=[gen_status, gen_file, gen_preview]
            )
        # ── TAB 2: Label Uploaded Data ───────────────────────────────────
        with gr.TabItem("Label Uploaded Data"):
            gr.Markdown("Upload an existing `.jsonl` or `.csv` file and automatically assign random class labels to each record.")
            with gr.Row():
                with gr.Column():
                    upload_file = gr.File(label="Upload Dataset (.jsonl or .csv)", file_types=[".jsonl", ".csv"])
                    label_col = gr.Textbox(value="label", label="Label Column Name")
                    label_classes = gr.Slider(2, 20, value=3, step=1, label="Number of Classes")
                    label_seed = gr.Number(value=42, label="Random Seed", precision=0)
            label_btn = gr.Button("Assign Labels", variant="primary")
            label_status = gr.Textbox(label="Status", interactive=False)
            label_file = gr.File(label="Download Labelled Dataset")
            label_preview = gr.Code(label="Preview (first 5 records)", language="json", lines=8)
            # Input order must match label_uploaded's positional parameters.
            label_btn.click(
                label_uploaded,
                inputs=[upload_file, label_col, label_classes, label_seed],
                outputs=[label_status, label_file, label_preview]
            )
        # ── TAB 3: About ─────────────────────────────────────────────────
        with gr.TabItem("About"):
            gr.Markdown("""
## Nexa Data Studio

Part of the **Nexa Stack** by [Aethron Labs](https://huggingface.co/AethronPhantom) — a Scientific Machine Learning Research Lab.

### Supported Dataset Types

| Type | Description | Use Case |
|------|-------------|----------|
| **Regression** | Continuous target from linear combination of features + noise | Surrogate model training |
| **Classification** | Gaussian cluster data with configurable classes | Classifier benchmarking |
| **Time Series** | Multi-series sinusoidal signals with noise | Forecasting, anomaly detection |
| **Molecular Properties** | Synthetic molecular descriptors (MW, logP, TPSA, HBD/HBA) | Drug discovery ML |
| **PDE Field (2D)** | 2D sinusoidal field solutions with noise | Physics-informed neural networks |

### Output Formats

- **JSONL** — one JSON object per line, ideal for streaming and LLM fine-tuning pipelines
- **CSV** — tabular format for pandas, sklearn, and spreadsheet tools

### Notes

All data is synthetically generated — no real molecular structures or physical measurements are included.
For research use only.
""")
# Start the web server only when this file is executed as a script, so that
# importing the module (for reuse or testing) has no side effects.
if __name__ == "__main__":
    demo.launch()