# Allanatrix
# Upload app.py with huggingface_hub
# 7917ec8 (verified)
"""
Nexa Data Studio — Scientific Dataset Generator
Aethron Labs | No payment required, fully functional synthetic data generation.
"""
import gradio as gr
import json
import csv
import io
import random
import math
import time
import tempfile
import os
from datetime import datetime
# ── Synthetic data generators ──────────────────────────────────────────────
def _gaussian_noise(n, dim, noise=0.05):
    """Return ``n`` rows of ``dim`` noisy standard-normal values.

    Each value is a ``gauss(0, 1)`` draw plus a ``gauss(0, noise)`` draw,
    rounded to 4 decimal places. Draws come from the module-level
    ``random`` generator, so results depend on its current state.
    """
    rows = []
    for _ in range(n):
        row = []
        for _ in range(dim):
            # Draw order matters for reproducibility: base first, then jitter.
            base = random.gauss(0, 1)
            jitter = random.gauss(0, noise)
            row.append(round(base + jitter, 4))
        rows.append(row)
    return rows
def generate_regression(n_samples: int, n_features: int, noise_level: float, seed: int) -> list[dict]:
    """Generate a synthetic linear-regression dataset.

    A weight vector is drawn uniformly from [-2, 2]; each sample has
    standard-normal features (rounded to 4 decimals) and a target equal to
    the weighted sum of those rounded features plus ``gauss(0, noise_level)``
    noise.

    Args:
        n_samples: Number of records to produce.
        n_features: Number of feature columns (``x1`` .. ``xN``).
        noise_level: Standard deviation of the additive target noise.
        seed: Seed for the private random generator.

    Returns:
        A list of dicts with keys ``x1..xN``, ``y`` and ``sample_id``,
        in that order.
    """
    # A private generator yields the same stream as random.seed(seed) on the
    # module-level RNG, but does not disturb (and is not disturbed by) other
    # code that uses the shared generator.
    rng = random.Random(seed)
    weights = [rng.uniform(-2, 2) for _ in range(n_features)]
    records = []
    for i in range(n_samples):
        x = [round(rng.gauss(0, 1), 4) for _ in range(n_features)]
        y = sum(w * xi for w, xi in zip(weights, x)) + rng.gauss(0, noise_level)
        record = {f"x{j}": xj for j, xj in enumerate(x, start=1)}
        record["y"] = round(y, 4)
        record["sample_id"] = i
        records.append(record)
    return records
def generate_classification(n_samples: int, n_classes: int, n_features: int,
                            noise_level: float, seed: int) -> list[dict]:
    """Generate Gaussian-cluster classification data.

    One centre per class is drawn uniformly from [-4, 4] in every feature
    dimension. Each sample picks a class uniformly at random and scatters
    around that class centre with standard deviation ``1 + noise_level``.

    Args:
        n_samples: Number of records to produce.
        n_classes: Number of distinct labels (``0 .. n_classes - 1``).
        n_features: Number of feature columns (``x1`` .. ``xN``).
        noise_level: Extra spread added to the unit cluster deviation.
        seed: Seed for the private random generator.

    Returns:
        A list of dicts with keys ``x1..xN``, ``label`` and ``sample_id``.

    Raises:
        ValueError: From ``randint`` when a sample is requested with
            ``n_classes`` below 1.
    """
    # Private generator: same stream as seeding the module-level RNG, without
    # mutating state shared with other callers.
    rng = random.Random(seed)
    centers = [[rng.uniform(-4, 4) for _ in range(n_features)] for _ in range(n_classes)]
    spread = 1 + noise_level
    records = []
    for i in range(n_samples):
        cls = rng.randint(0, n_classes - 1)
        center = centers[cls]
        record = {
            f"x{j + 1}": round(center[j] + rng.gauss(0, spread), 4)
            for j in range(n_features)
        }
        record["label"] = cls
        record["sample_id"] = i
        records.append(record)
    return records
def generate_timeseries(n_samples: int, n_series: int, noise_level: float, seed: int) -> list[dict]:
    """Generate noisy sinusoidal time series in long (one row per point) form.

    Every series gets its own frequency in [0.05, 0.3], amplitude in
    [0.5, 2.0] and phase in [0, 2*pi]; each timestep value is the sine wave
    plus ``gauss(0, noise_level)`` noise, rounded to 4 decimals.

    Args:
        n_samples: Number of timesteps per series.
        n_series: Number of independent series.
        noise_level: Standard deviation of the additive noise.
        seed: Seed for the private random generator.

    Returns:
        ``n_series * n_samples`` dicts with keys ``series_id``, ``timestep``
        and ``value``, ordered by series then timestep.
    """
    # Private generator: same stream as seeding the module-level RNG, without
    # mutating state shared with other callers.
    rng = random.Random(seed)
    records = []
    for s in range(n_series):
        freq = rng.uniform(0.05, 0.3)
        amp = rng.uniform(0.5, 2.0)
        phase = rng.uniform(0, 2 * math.pi)
        for t in range(n_samples):
            val = amp * math.sin(2 * math.pi * freq * t + phase) + rng.gauss(0, noise_level)
            records.append({"series_id": s, "timestep": t, "value": round(val, 4)})
    return records
def generate_molecular(n_samples: int, seed: int) -> list[dict]:
    """Generate synthetic molecular-descriptor records.

    All descriptors are sampled independently of one another: ``n_atoms``
    and ``mol_weight`` are NOT derived from the random ``formula`` string.
    ``lipinski_pass`` is 1 when MW <= 500, logP <= 5, HBD <= 5 and
    HBA <= 10; given the sampling ranges used here only logP can fail.

    Args:
        n_samples: Number of records to produce.
        seed: Seed for the private random generator.

    Returns:
        A list of dicts with keys ``sample_id``, ``formula``, ``n_atoms``,
        ``mol_weight``, ``logP``, ``TPSA``, ``HBD``, ``HBA`` and
        ``lipinski_pass``.
    """
    # Private generator: same stream as seeding the module-level RNG, without
    # mutating state shared with other callers.
    rng = random.Random(seed)
    elements = ["C", "H", "O", "N", "S", "P", "F", "Cl"]
    records = []
    for i in range(n_samples):
        n_atoms = rng.randint(5, 20)
        # 2-4 distinct elements, each with a count of 1-6, in sampled order.
        chosen = rng.sample(elements, rng.randint(2, 4))
        formula = "".join(f"{e}{rng.randint(1, 6)}" for e in chosen)
        mw = round(rng.uniform(50, 500), 2)
        logp = round(rng.gauss(2.0, 1.5), 3)
        tpsa = round(rng.uniform(20, 150), 2)
        hbd = rng.randint(0, 5)
        hba = rng.randint(0, 10)
        records.append({
            "sample_id": i, "formula": formula, "n_atoms": n_atoms,
            "mol_weight": mw, "logP": logp, "TPSA": tpsa,
            "HBD": hbd, "HBA": hba,
            "lipinski_pass": int(mw <= 500 and logp <= 5 and hbd <= 5 and hba <= 10),
        })
    return records
def generate_pde_field(n_samples: int, grid_size: int, noise_level: float, seed: int) -> list[dict]:
    """Generate noisy 2D fields ``u = sin(kx*pi*x) * cos(ky*pi*y)`` on a grid.

    Each sample draws its own wavenumbers ``kx`` and ``ky`` from [0.5, 3.0]
    and is evaluated on a ``grid_size`` x ``grid_size`` lattice with
    coordinates ``index / grid_size``, i.e. in [0, 1) -- the edge at 1.0 is
    never sampled.

    Args:
        n_samples: Number of independent fields.
        grid_size: Number of grid points along each axis.
        noise_level: Standard deviation of the additive noise on ``u``.
        seed: Seed for the private random generator.

    Returns:
        ``n_samples * grid_size**2`` dicts with keys ``sample_id``, ``x``,
        ``y`` and ``u`` (one row per grid point, x-major order).
    """
    # Private generator: same stream as seeding the module-level RNG, without
    # mutating state shared with other callers.
    rng = random.Random(seed)
    records = []
    for i in range(n_samples):
        kx = rng.uniform(0.5, 3.0)
        ky = rng.uniform(0.5, 3.0)
        for gx in range(grid_size):
            x = gx / grid_size
            for gy in range(grid_size):
                y = gy / grid_size
                u = math.sin(kx * math.pi * x) * math.cos(ky * math.pi * y) + rng.gauss(0, noise_level)
                records.append({"sample_id": i, "x": round(x, 3), "y": round(y, 3), "u": round(u, 4)})
    return records
# ── File writers ────────────────────────────────────────────────────────────
def records_to_jsonl(records):
    """Serialise records as JSON Lines: one JSON document per line.

    Lines are joined with ``\\n`` and there is no trailing newline; an
    empty input yields an empty string.
    """
    encoded = map(json.dumps, records)
    return "\n".join(encoded)
def records_to_csv(records):
    """Serialise a list of dict records to CSV text (header + rows).

    The header is the union of all record keys in first-seen order. For
    records that share one schema this equals the first record's keys;
    when a later record carries an extra key, the column is appended and
    records lacking it get an empty cell (instead of ``DictWriter``
    raising ``ValueError`` for the unknown field).

    Args:
        records: List of dicts to write.

    Returns:
        The CSV document as a string using the csv module's default
        ``\\r\\n`` row terminator, or ``""`` when ``records`` is empty.
    """
    if not records:
        return ""
    # dict.fromkeys keeps insertion order, giving an ordered, de-duplicated union.
    fieldnames = list(dict.fromkeys(key for record in records for key in record))
    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=fieldnames, restval="")
    writer.writeheader()
    writer.writerows(records)
    return buf.getvalue()
def save_to_tmp(content, ext):
    """Write ``content`` to a new temporary file and return its path.

    The file is created with ``delete=False`` so it outlives this call;
    nothing here removes it afterwards.

    Args:
        content: Text to write.
        ext: File extension without the leading dot (e.g. ``"csv"``).

    Returns:
        Absolute path of the written file.
    """
    # encoding="utf-8": do not depend on the platform's locale encoding.
    # newline="": write the text verbatim, so CSV content that already uses
    # "\r\n" row endings is not expanded to "\r\r\n" on Windows.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=f".{ext}", mode="w", encoding="utf-8", newline=""
    ) as tmp:
        tmp.write(content)
        return tmp.name
# ── Main generation function ────────────────────────────────────────────────
def run_generation(dataset_type, n_samples, n_features, n_classes, n_series,
                   grid_size, noise_level, seed, output_format, progress=gr.Progress()):
    """Generate a dataset, write it to a temp file and report the outcome.

    Args:
        dataset_type: One of "Regression", "Classification", "Time Series",
            "Molecular Properties" or "PDE Field (2D)".
        n_samples, n_features, n_classes, n_series, grid_size: Size
            parameters; each is cast to ``int`` only if the selected
            generator uses it.
        noise_level: Noise parameter, cast to ``float`` when used.
        seed: Random seed, cast to ``int``.
        output_format: "JSONL" selects JSON Lines; anything else selects CSV.
        progress: Gradio progress tracker, called with a fraction and a
            description at each stage.

    Returns:
        ``(status, filepath, preview)``. On success ``filepath`` is the
        written temp file and ``preview`` holds the first five records as
        JSON lines. On an unknown type or any error, ``filepath`` is
        ``None`` and ``preview`` is ``""``, with the reason in ``status``.
    """
    # Local import: the file-level import only brings in `datetime` itself.
    from datetime import timezone

    progress(0, desc="Initialising...")
    time.sleep(0.2)  # brief pause, presumably so the first progress step is visible
    progress(0.2, desc="Generating samples...")
    try:
        # Lazy dispatch: arguments are converted only for the chosen
        # generator, so an unused (possibly empty) control cannot raise.
        generators = {
            "Regression": lambda: generate_regression(
                int(n_samples), int(n_features), float(noise_level), int(seed)),
            "Classification": lambda: generate_classification(
                int(n_samples), int(n_classes), int(n_features), float(noise_level), int(seed)),
            "Time Series": lambda: generate_timeseries(
                int(n_samples), int(n_series), float(noise_level), int(seed)),
            "Molecular Properties": lambda: generate_molecular(
                int(n_samples), int(seed)),
            "PDE Field (2D)": lambda: generate_pde_field(
                int(n_samples), int(grid_size), float(noise_level), int(seed)),
        }
        make_records = generators.get(dataset_type)
        if make_records is None:
            return "Unknown dataset type.", None, ""
        records = make_records()

        progress(0.7, desc="Serialising output...")
        time.sleep(0.1)
        if output_format == "JSONL":
            content, ext = records_to_jsonl(records), "jsonl"
        else:
            content, ext = records_to_csv(records), "csv"

        progress(0.9, desc="Writing file...")
        filepath = save_to_tmp(content, ext)
        progress(1.0, desc="Done!")

        preview = records_to_jsonl(records[:5])
        # Timezone-aware replacement for the deprecated datetime.utcnow().
        stamp = datetime.now(timezone.utc).strftime('%H:%M:%S UTC')
        status = (
            f"Generated {len(records):,} records · {dataset_type} · "
            f"{output_format} · seed={seed} · {stamp}"
        )
        return status, filepath, preview
    except Exception as e:
        # UI boundary: surface the failure in the status box rather than
        # letting the handler raise.
        return f"Error: {e}", None, ""
# ── Label uploaded data ─────────────────────────────────────────────────────
def _load_records(path):
    """Read ``path`` as JSONL or CSV and return ``(records, is_jsonl)``.

    The file counts as JSONL only when its first line parses as a JSON
    *object*; a first line that is merely a JSON scalar (e.g. a quoted CSV
    header such as ``"name"``) is treated as CSV.
    """
    # utf-8-sig also accepts plain UTF-8 and drops a leading BOM;
    # newline="" is what the csv module expects of its input file.
    with open(path, "r", encoding="utf-8-sig", newline="") as f:
        first_line = f.readline().strip()
        try:
            is_jsonl = isinstance(json.loads(first_line), dict)
        except ValueError:
            is_jsonl = False
        f.seek(0)
        if not is_jsonl:
            return list(csv.DictReader(f)), False
        records = []
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue  # tolerate blank lines between records
            obj = json.loads(line)
            if not isinstance(obj, dict):
                raise ValueError(f"line {lineno}: expected a JSON object")
            records.append(obj)
        return records, True


def label_uploaded(file, label_col_name, n_classes, seed, progress=gr.Progress()):
    """Add a random integer class label to every record of an uploaded file.

    Args:
        file: Uploaded file from the Gradio ``File`` component -- either a
            path string or an object exposing the path as ``.name``.
        label_col_name: Name of the column/key that receives the label.
        n_classes: Number of classes; labels are drawn from
            ``0 .. n_classes - 1``.
        seed: Seed for the label generator.
        progress: Gradio progress tracker.

    Returns:
        ``(status, filepath, preview)``. The output keeps the input's format
        (JSONL or CSV). On missing input or any error, ``filepath`` is
        ``None`` and ``preview`` is ``""``.
    """
    if file is None:
        return "No file uploaded.", None, ""
    if not label_col_name or not label_col_name.strip():
        return "Label column name must not be empty.", None, ""
    progress(0, desc="Reading file...")
    try:
        # Depending on the Gradio version/configuration the component hands
        # over a plain path or a tempfile-like wrapper with a .name attribute.
        path = file if isinstance(file, (str, os.PathLike)) else file.name
        records, is_jsonl = _load_records(path)

        progress(0.5, desc="Assigning labels...")
        n_labels = int(n_classes)
        # Private generator: reproducible for a given seed without touching
        # the module-level RNG.
        rng = random.Random(seed)
        for r in records:
            r[label_col_name] = rng.randint(0, n_labels - 1)

        progress(0.85, desc="Writing output...")
        if is_jsonl:
            content, ext = records_to_jsonl(records), "jsonl"
        else:
            content, ext = records_to_csv(records), "csv"
        filepath = save_to_tmp(content, ext)
        progress(1.0, desc="Done!")

        preview = records_to_jsonl(records[:5])
        status = f"Labelled {len(records):,} records with {n_labels} classes → column '{label_col_name}'"
        return status, filepath, preview
    except Exception as e:
        # UI boundary: report the failure in the status box.
        return f"Error: {e}", None, ""
# ── Gradio UI ───────────────────────────────────────────────────────────────
# Custom stylesheet passed to gr.Blocks(css=CSS): dark page background with
# light text, a centred container capped at 960px, monospace headings, a
# purple primary button with a lighter hover state, and a hidden footer.
CSS = """
body, .gradio-container { background: #070a12 !important; color: #e8eaf6 !important; }
.gradio-container { max-width: 960px !important; margin: 0 auto !important; }
h1, h2, h3 { font-family: 'Space Mono', monospace !important; }
.gr-button-primary { background: #7c5cfc !important; border-color: #7c5cfc !important; }
.gr-button-primary:hover { background: #9b7ffe !important; }
footer { display: none !important; }
"""
# Three-tab layout: dataset generation, labelling of an uploaded file, and a
# static "About" page. Markdown bodies start at column 0 on purpose: indented
# text inside the string would be rendered as a code block.
with gr.Blocks(title="Nexa Data Studio", css=CSS, theme=gr.themes.Base()) as demo:
    # Blank lines keep the heading, subtitle and description as separate
    # paragraphs; without one before `---`, Markdown would treat the line
    # above it as a setext heading instead of drawing a horizontal rule.
    gr.Markdown("""
# ⬡ Nexa Data Studio

**Scientific Dataset Generator** · Aethron Labs

Generate synthetic datasets for ML research — regression, classification, time series, molecular, and PDE fields. No payment required.

---
""")
    with gr.Tabs():
        # ── TAB 1: Generate ──────────────────────────────────────────────
        with gr.TabItem("Generate Dataset"):
            with gr.Row():
                # Left column: options shared by every dataset type.
                with gr.Column(scale=1):
                    dataset_type = gr.Dropdown(
                        ["Regression", "Classification", "Time Series", "Molecular Properties", "PDE Field (2D)"],
                        label="Dataset Type", value="Regression"
                    )
                    n_samples = gr.Slider(50, 5000, value=500, step=50, label="Number of Samples")
                    output_format = gr.Radio(["JSONL", "CSV"], value="JSONL", label="Output Format")
                    noise_level = gr.Slider(0.0, 2.0, value=0.1, step=0.05, label="Noise Level (σ)")
                    seed = gr.Number(value=42, label="Random Seed", precision=0)
                # Right column: type-specific options; only the groups that
                # apply to the selected dataset type are shown.
                with gr.Column(scale=1):
                    with gr.Group() as reg_cls_opts:
                        n_features = gr.Slider(1, 20, value=4, step=1, label="Number of Features")
                    with gr.Group(visible=False) as cls_opts:
                        n_classes = gr.Slider(2, 10, value=3, step=1, label="Number of Classes")
                    with gr.Group(visible=False) as ts_opts:
                        n_series = gr.Slider(1, 20, value=3, step=1, label="Number of Series")
                    with gr.Group(visible=False) as pde_opts:
                        grid_size = gr.Slider(4, 32, value=8, step=2, label="Grid Size (NxN)")

            def update_opts(dtype):
                """Return visibility updates for the four option groups."""
                show_feat = dtype in ["Regression", "Classification"]
                show_cls = dtype == "Classification"
                show_ts = dtype == "Time Series"
                show_pde = dtype == "PDE Field (2D)"
                return (
                    gr.update(visible=show_feat),
                    gr.update(visible=show_cls),
                    gr.update(visible=show_ts),
                    gr.update(visible=show_pde),
                )

            dataset_type.change(update_opts, dataset_type, [reg_cls_opts, cls_opts, ts_opts, pde_opts])
            gen_btn = gr.Button("Generate Dataset", variant="primary")
            gen_status = gr.Textbox(label="Status", interactive=False)
            gen_file = gr.File(label="Download Generated Dataset")
            gen_preview = gr.Code(label="Preview (first 5 records)", language="json", lines=8)
            # Input order must match run_generation's positional parameters.
            gen_btn.click(
                run_generation,
                inputs=[dataset_type, n_samples, n_features, n_classes, n_series, grid_size, noise_level, seed, output_format],
                outputs=[gen_status, gen_file, gen_preview]
            )
        # ── TAB 2: Label Uploaded Data ───────────────────────────────────
        with gr.TabItem("Label Uploaded Data"):
            gr.Markdown("Upload an existing `.jsonl` or `.csv` file and automatically assign random class labels to each record.")
            with gr.Row():
                with gr.Column():
                    upload_file = gr.File(label="Upload Dataset (.jsonl or .csv)", file_types=[".jsonl", ".csv"])
                    label_col = gr.Textbox(value="label", label="Label Column Name")
                    label_classes = gr.Slider(2, 20, value=3, step=1, label="Number of Classes")
                    label_seed = gr.Number(value=42, label="Random Seed", precision=0)
            label_btn = gr.Button("Assign Labels", variant="primary")
            label_status = gr.Textbox(label="Status", interactive=False)
            label_file = gr.File(label="Download Labelled Dataset")
            label_preview = gr.Code(label="Preview (first 5 records)", language="json", lines=8)
            # Input order must match label_uploaded's positional parameters.
            label_btn.click(
                label_uploaded,
                inputs=[upload_file, label_col, label_classes, label_seed],
                outputs=[label_status, label_file, label_preview]
            )
        # ── TAB 3: About ─────────────────────────────────────────────────
        with gr.TabItem("About"):
            gr.Markdown("""
## Nexa Data Studio

Part of the **Nexa Stack** by [Aethron Labs](https://huggingface.co/AethronPhantom) — a Scientific Machine Learning Research Lab.

### Supported Dataset Types

| Type | Description | Use Case |
|------|-------------|----------|
| **Regression** | Continuous target from linear combination of features + noise | Surrogate model training |
| **Classification** | Gaussian cluster data with configurable classes | Classifier benchmarking |
| **Time Series** | Multi-series sinusoidal signals with noise | Forecasting, anomaly detection |
| **Molecular Properties** | Synthetic molecular descriptors (MW, logP, TPSA, HBD/HBA) | Drug discovery ML |
| **PDE Field (2D)** | 2D sinusoidal field solutions with noise | Physics-informed neural networks |

### Output Formats

- **JSONL** — one JSON object per line, ideal for streaming and LLM fine-tuning pipelines
- **CSV** — tabular format for pandas, sklearn, and spreadsheet tools

### Notes

All data is synthetically generated — no real molecular structures or physical measurements are included.
For research use only.
""")
demo.launch()