It runs on Colab CPU

#3
by asdgad - opened

```python
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM
import os

# Define the root local path where the full model repository was cloned.
# This should be the directory that contains the 'onnx' subfolder, config.json, etc.
local_repo_root = "/content/mistral-7b-instruct-v0.2-ONNX"

# The specific subfolder within local_repo_root where the ONNX model files are located
onnx_model_subfolder = "onnx/cpu_and_mobile/mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4"

# The specific ONNX file name
onnx_file_name = "mistral-7b-instruct-v0.2-cpu-int4-rtn-block-32-acc-level-4.onnx"

# Ensure the root directory exists
if not os.path.exists(local_repo_root):
    print(f"Error: Local repository root not found at {local_repo_root}")
else:
    print(f"Loading model and tokenizer from local repository: {local_repo_root}")

    # Manually download tokenizer.model into local_repo_root if it's not already there
    tokenizer_model_path = os.path.join(local_repo_root, "tokenizer.model")
    if not os.path.exists(tokenizer_model_path):
        print(f"Downloading missing tokenizer.model to {local_repo_root}...")
        os.system(f"wget -O {tokenizer_model_path} https://huggingface.co/microsoft/mistral-7b-instruct-v0.2-ONNX/resolve/main/tokenizer.model")
        print("tokenizer.model downloaded.")

    # Load the ONNX model from the local repository, pointing to the subfolder
    model = ORTModelForCausalLM.from_pretrained(
        local_repo_root,                 # root of the cloned repository
        subfolder=onnx_model_subfolder,  # subfolder that holds the ONNX model
        file_name=onnx_file_name,
        use_io_binding=True,
        local_files_only=True,           # crucial: only look locally
    )

    # Load the tokenizer from the local repository root
    tokenizer = AutoTokenizer.from_pretrained(
        local_repo_root,       # root of the cloned repository for tokenizer files
        local_files_only=True, # crucial: only look locally
    )

    # Create the text-generation pipeline
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

    # Test the model
    result = pipe(
        "Who is Napoleon Bonaparte?",
        max_new_tokens=30,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        num_return_sequences=1,
    )
    print(result)
```
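One note on the test prompt: Mistral instruct checkpoints expect the `[INST] ... [/INST]` prompt format. Here is a minimal sketch of building that prompt via the tokenizer before calling the pipeline; it assumes the tokenizer config in this repo ships a chat template (if it doesn't, you would have to format the prompt by hand):

```python
# Build an instruction-formatted prompt instead of passing raw text.
messages = [{"role": "user", "content": "Who is Napoleon Bonaparte?"}]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,            # return the formatted string, not token IDs
    add_generation_prompt=True # append the assistant turn marker
)

result = pipe(prompt, max_new_tokens=30, do_sample=True, temperature=0.7)
# The text-generation pipeline returns a list of dicts with "generated_text".
print(result[0]["generated_text"])
```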

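If you would rather not shell out to `wget` for the missing `tokenizer.model`, a possible alternative (not part of the original script, and assuming the `huggingface_hub` package is available in the Colab runtime) is to fetch the file with `hf_hub_download`:

```python
from huggingface_hub import hf_hub_download

# Download tokenizer.model straight into the cloned repo directory.
hf_hub_download(
    repo_id="microsoft/mistral-7b-instruct-v0.2-ONNX",
    filename="tokenizer.model",
    local_dir="/content/mistral-7b-instruct-v0.2-ONNX",
)
```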