44 lines
1.6 KiB
Python
44 lines
1.6 KiB
Python
from dotenv import load_dotenv

# Load environment variables (e.g. HF tokens) BEFORE importing unsloth,
# which may read them at import time.
load_dotenv()

from unsloth import FastLanguageModel, FastModel
from transformers import TextStreamer

# Path to the locally fine-tuned Gemma-3 270M checkpoint.
model_name = r"E:\code\python\fine-tuning\gemma-3-270m-finetuned"

# Load model + tokenizer; dtype=None lets unsloth auto-select the dtype.
model, tokenizer = FastModel.from_pretrained(
    model_name=model_name,
    max_seq_length=512,
    dtype=None,
    load_in_4bit=False,
    full_finetuning=False,
)

# Attach LoRA adapters to the attention + MLP projections.
# NOTE(review): if the checkpoint above already contains trained adapters,
# wrapping again attaches *fresh* (untrained) adapters — confirm intended.
model = FastLanguageModel.get_peft_model(
    model,
    r=128,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj",
                    "up_proj", "down_proj"],
    use_gradient_checkpointing="unsloth",
    lora_alpha=128,
    lora_dropout=0,
    bias="none",
    random_state=3407,
    # BUG FIX: the keyword is `use_rslora` (rank-stabilized LoRA);
    # the original `use_rslor` is not a valid parameter name.
    use_rslora=False,
    loftq_config=None,
)

# Switch the model into unsloth's faster inference mode.
FastLanguageModel.for_inference(model)
|
|
|
|
def chat(model, message):
    """Generate and stream a reply to *message*.

    Tokens are printed to stdout as they are produced (prompt excluded)
    via ``TextStreamer``.  Uses the module-level ``tokenizer`` and
    assumes the model lives on CUDA — TODO confirm for CPU-only runs.

    Returns the generated token-id tensor so callers can also decode or
    inspect the full output (the original discarded it).
    """
    inputs = tokenizer([message], return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        streamer=TextStreamer(tokenizer, skip_prompt=True),
        # BUG FIX: without do_sample=True, temperature/top_p/top_k are
        # silently ignored when the checkpoint's generation config
        # defaults to greedy decoding — enable sampling so the
        # Gemma-recommended settings below take effect.
        do_sample=True,
        temperature=1.0,
        top_p=0.95,
        top_k=64,
    )
    return outputs
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Minimal REPL: read one user turn, wrap it in Gemma's chat markup,
    # and stream the model's reply.  Type "exit" to quit.
    while True:
        message = input("user:")
        if message == "exit":
            break
        prompt = (
            "<start_of_turn>user\n"
            f"{message}\n<end_of_turn>\n"
            "<start_of_turn>model\n"
        )
        chat(model, prompt)
|
|
|