Hugging Face Inference API
Recommended Models
- mistralai/Mistral-7B-Instruct-v0.1 - Best quality
- google/flan-t5-xxl - Good instruction following
- microsoft/DialoGPT-large - Conversational AI
Setup Steps
1. Get an HF API token from your Hugging Face account settings (a setup sketch follows this list)
2. Install the requests library
3. Implement the API calls
4. Handle the responses
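A minimal setup sketch for steps 1-2, assuming the token is stored in an environment variable named HF_API_TOKEN (the variable name is just an example, not part of the original steps):

# Install the dependency first (from a shell): pip install requests
import os

# Assumes the token was exported as HF_API_TOKEN; adjust to however you store secrets.
hf_token = os.environ.get("HF_API_TOKEN")
if not hf_token:
    raise RuntimeError("Set HF_API_TOKEN with your Hugging Face API token")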
Python Implementation
import requests

class HuggingFaceAPI:
    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "https://api-inference.huggingface.co/models"

    def query_model(self, model_name, prompt, max_length=500):
        headers = {"Authorization": f"Bearer {self.api_key}"}
        payload = {
            "inputs": prompt,
            "parameters": {
                "max_length": max_length,
                "temperature": 0.7,
                "do_sample": True
            }
        }
        try:
            response = requests.post(
                f"{self.base_url}/{model_name}",
                headers=headers,
                json=payload,
                timeout=60
            )
            response.raise_for_status()  # surface HTTP errors instead of parsing an error body
            result = response.json()
            return result[0]["generated_text"]
        except Exception as e:
            return f"Error: {str(e)}"

# Usage
hf_api = HuggingFaceAPI("your_hf_api_key_here")
response = hf_api.query_model(
    "mistralai/Mistral-7B-Instruct-v0.1",
    "Generate a Python function to calculate factorial"
)
print(response)
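The hosted Inference API typically answers with a 503 while a model is still loading, so a first request can fail. A hedged retry sketch on top of the class above (query_with_retry, the retry count, and the wait time are illustrative choices, not part of the original code):

import time

def query_with_retry(api, model_name, prompt, retries=3, wait_seconds=20):
    """Retry while the hosted model is still warming up."""
    for _ in range(retries):
        result = api.query_model(model_name, prompt)
        # query_model returns an "Error: ..." string on failure, so retry only in that case
        if not result.startswith("Error:"):
            return result
        time.sleep(wait_seconds)  # rough wait for the model to spin up; tune as needed
    return result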
Gradio Integration Example
import gradio as gr

def chat_interface(message, history):
    hf_api = HuggingFaceAPI("your_hf_api_key")
    response = hf_api.query_model("mistralai/Mistral-7B-Instruct-v0.1", message)
    return response

iface = gr.ChatInterface(
    chat_interface,
    title="PromptCraft AI Assistant",
    description="Powered by Hugging Face models"
)
iface.launch()
OpenAI API
Available Models
- gpt-4 - Most capable
- gpt-3.5-turbo - Fast & cost-effective
- gpt-4-turbo - Latest features
Pricing (approximate; see the quick estimate below)
- GPT-4: $0.03 per 1K input tokens
- GPT-3.5: $0.0015 per 1K input tokens
- Free tier: $18 credit for 3 months
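A quick back-of-the-envelope check using the input rates listed above (output-token pricing is ignored here for simplicity, so real bills will be higher; the helper name is illustrative):

def estimate_input_cost(prompt_tokens, model="gpt-3.5-turbo"):
    """Rough input-only cost estimate using the per-1K-token rates listed above."""
    rates_per_1k = {"gpt-4": 0.03, "gpt-3.5-turbo": 0.0015}  # USD per 1K input tokens
    return prompt_tokens / 1000 * rates_per_1k[model]

# Example: a 2,000-token prompt to GPT-4 is roughly $0.06 of input tokens
print(estimate_input_cost(2000, "gpt-4"))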
OpenAI Implementation
import openai
from typing import List, Dict, Optional

class OpenAIClient:
    def __init__(self, api_key: str, model: str = "gpt-3.5-turbo"):
        self.client = openai.OpenAI(api_key=api_key)
        self.model = model

    def chat_completion(self, messages: List[Dict], temperature: float = 0.7) -> str:
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                temperature=temperature,
                max_tokens=1000
            )
            return response.choices[0].message.content
        except Exception as e:
            return f"Error: {str(e)}"

    def single_message(self, prompt: str, system_message: Optional[str] = None) -> str:
        messages = []
        if system_message:
            messages.append({"role": "system", "content": system_message})
        messages.append({"role": "user", "content": prompt})
        return self.chat_completion(messages)

# Usage
openai_client = OpenAIClient("your_openai_api_key")
system_prompt = "You are a helpful AI assistant that provides direct answers."
response = openai_client.single_message(
    "Generate a Python function to calculate factorial",
    system_prompt
)
print(response)
Advanced Features
# Streaming responses (reuses the underlying client from the OpenAIClient instance above)
def stream_chat(messages):
    stream = openai_client.client.chat.completions.create(
        model="gpt-4",
        messages=messages,
        stream=True
    )
    for chunk in stream:
        if chunk.choices[0].delta.content is not None:
            yield chunk.choices[0].delta.content

# Function calling: describe a callable so the model can request it with structured arguments
functions = [{
    "name": "calculate_factorial",
    "description": "Calculate factorial of a number",
    "parameters": {
        "type": "object",
        "properties": {
            "number": {"type": "integer", "description": "The number to calculate factorial for"}
        },
        "required": ["number"]
    }
}]
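To show how the schema above is used, a minimal round-trip sketch with OpenAI's legacy functions/function_call interface (newer SDK versions also offer a tools= variant; the prompt and model choice here are illustrative):

import json
import math

response = openai_client.client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "What is the factorial of 6?"}],
    functions=functions,
    function_call="auto"  # let the model decide whether to call the function
)

message = response.choices[0].message
if message.function_call is not None:
    # The model returns the arguments as a JSON string
    args = json.loads(message.function_call.arguments)
    print(message.function_call.name, "->", math.factorial(args["number"]))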
Lightweight Models
- microsoft/DialoGPT-small - 334MB
- distilgpt2 - 534MB
- facebook/blenderbot-400M-distill - 1.5GB
Requirements
- Minimum 2GB RAM for small models
- Python 3.8+ with the transformers library
- Internet access for the initial model download
- GPU optional but recommended (a quick environment check follows this list)
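A short check of the requirements above, assuming torch and transformers are already installed:

import sys
import torch
import transformers

print("Python:", sys.version.split()[0])                  # needs 3.8+
print("transformers:", transformers.__version__)
print("CUDA GPU available:", torch.cuda.is_available())   # optional, but speeds up generation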
Local Model Implementation
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch

class LocalAIModel:
    def __init__(self, model_name: str = "microsoft/DialoGPT-small"):
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.chat_pipeline = None

    def load_model(self):
        """Load the model and tokenizer"""
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForCausalLM.from_pretrained(self.model_name)
            # Create chat pipeline
            self.chat_pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                device=0 if torch.cuda.is_available() else -1  # use the GPU when one is present
            )
            return True
        except Exception as e:
            print(f"Error loading model: {e}")
            return False
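A short usage sketch under the assumption that load_model completes as above and returns True/False (the prompt and generation arguments are illustrative):

# Usage
local_model = LocalAIModel("microsoft/DialoGPT-small")
if local_model.load_model():
    output = local_model.chat_pipeline(
        "Hello, how are you?",
        max_new_tokens=50,
        pad_token_id=local_model.tokenizer.eos_token_id  # avoids a padding warning with DialoGPT
    )
    print(output[0]["generated_text"])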