import gradio as gr
from vllm import LLM, SamplingParams

# Load the model from Hugging Face; vLLM loads the matching tokenizer internally,
# so a separate AutoTokenizer is not needed here
model_name = "facebook/opt-125m"

# Initialize vLLM for CPU execution (requires a vLLM build with CPU support)
vllm_model = LLM(model=model_name, tensor_parallel_size=1, device="cpu")

def generate_response(prompt, max_tokens, temperature, top_p):
    # Define sampling parameters (Gradio sliders return floats, so cast max_tokens)
    sampling_params = SamplingParams(
        max_tokens=int(max_tokens),
        temperature=temperature,
        top_p=top_p,
    )
    # Generate text with vLLM; it accepts raw prompt strings and tokenizes internally
    outputs = vllm_model.generate([prompt], sampling_params)
    # Each RequestOutput carries one or more completions; return the first one's text
    return outputs[0].outputs[0].text
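# The original code tokenized the prompt manually and passed raw token IDs to
# generate(), which vLLM would misread as prompt strings. If pre-tokenized
# input is genuinely needed, recent vLLM releases accept it via TokensPrompt;
# a minimal sketch, assuming vllm.inputs.TokensPrompt exists in the installed
# version:
#
#     from transformers import AutoTokenizer
#     from vllm.inputs import TokensPrompt
#
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     token_ids = tokenizer(prompt)["input_ids"]
#     outputs = vllm_model.generate(
#         TokensPrompt(prompt_token_ids=token_ids), sampling_params
#     )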
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Hugging Face Integration with vLLM (CPU)")
    gr.Markdown("Generate text using the vLLM integration with Hugging Face models on CPU.")
    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(
                label="Prompt",
                placeholder="Enter your prompt here...",
                lines=3,
            )
            max_tokens = gr.Slider(
                label="Max Tokens",
                minimum=10,
                maximum=500,
                value=100,
                step=10,
            )
            temperature = gr.Slider(
                label="Temperature",
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                step=0.1,
            )
            top_p = gr.Slider(
                label="Top P",
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.1,
            )
            submit_button = gr.Button("Generate")
        with gr.Column():
            output_text = gr.Textbox(
                label="Generated Text",
                lines=10,
                interactive=False,
            )
    submit_button.click(
        generate_response,
        inputs=[prompt_input, max_tokens, temperature, top_p],
        outputs=output_text,
    )
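# Optional smoke test before serving the UI: call the handler directly to
# confirm generation works end to end (the prompt and parameter values below
# are illustrative):
#
#     print(generate_response("Once upon a time", 50, 0.7, 0.9))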
# Launch the app
demo.launch()