This runs on a Colab CPU runtime.
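The script below assumes the full model repository already sits at /content/mistral-7b-instruct-v0.2-ONNX. As a minimal sketch of one way to get it there (an assumption, not part of the original setup), huggingface_hub.snapshot_download can be run in a separate Colab cell beforehand; the allow_patterns filter shown here is only an illustration to skip the GPU/DirectML variants:

from huggingface_hub import snapshot_download

# Hypothetical download step: fetch the repository into the path the script expects.
# allow_patterns is an assumed filter, not part of the original setup.
snapshot_download(
    repo_id="microsoft/mistral-7b-instruct-v0.2-ONNX",
    local_dir="/content/mistral-7b-instruct-v0.2-ONNX",
    allow_patterns=[
        "*.json",
        "tokenizer*",
        "onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4/*",
    ],
)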
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM
import os

# Define the root local path where the full model repository was cloned.
# This should be the directory that contains the 'onnx' subfolder, config.json, etc.
local_repo_root = "/content/mistral-7b-instruct-v0.2-ONNX"

# The specific subfolder within local_repo_root that holds the ONNX model files
onnx_model_subfolder = "onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4"

# The specific ONNX file name
onnx_file_name = "mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx"

# Ensure the root directory exists
if not os.path.exists(local_repo_root):
print(f"Error: Local repository root not found at {local_repo_root}")
else:
print(f"Loading model and tokenizer from local repository: {local_repo_root}")
Manually download the tokenizer.model file to the local_repo_root if it's not already there
tokenizer_model_path = os.path.join(local_repo_root, "tokenizer.model")
if not os.path.exists(tokenizer_model_path):
print(f"Downloading missing tokenizer.model to {local_repo_root}...")
os.system(f"wget -O {tokenizer_model_path} https://huggingface.co/microsoft/mistral-7b-instruct-v0.2-ONNX/resolve/main/tokenizer.model")
print("tokenizer.model downloaded.")

    # Load the ONNX model from the local repository, pointing to the int4 CPU subfolder
    model = ORTModelForCausalLM.from_pretrained(
        local_repo_root,                 # root of the cloned repository
        subfolder=onnx_model_subfolder,  # subfolder that contains the ONNX model
        file_name=onnx_file_name,
        use_io_binding=False,            # IO binding targets GPU execution providers; keep it off on CPU
        local_files_only=True,           # crucial: only look locally, never hit the Hub
    )

    # Load the tokenizer from the local repository root
    tokenizer = AutoTokenizer.from_pretrained(
        local_repo_root,        # the cloned repository root holds the tokenizer files
        local_files_only=True,  # crucial: only look locally, never hit the Hub
    )

    # Create the text-generation pipeline
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

    # Test the model
    result = pipe(
        "Who is Napoleon Bonaparte?",
        max_new_tokens=30,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        num_return_sequences=1,
    )
    print(result)
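Mistral Instruct checkpoints are trained on the [INST] ... [/INST] prompt format, so results are usually better when the prompt is wrapped accordingly. A minimal follow-up sketch, assuming the local tokenizer_config.json ships a chat template (not verified for this particular ONNX repo):

# Hypothetical: format the question with the tokenizer's chat template before generating.
messages = [{"role": "user", "content": "Who is Napoleon Bonaparte?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
result = pipe(prompt, max_new_tokens=30, do_sample=True, temperature=0.7)
print(result[0]["generated_text"])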