Finetuning a Reasoning LLM with Unsloth and Serving with vLLM
In this tutorial, we create a workflow that finetunes a reasoning Qwen3 large language model with Unsloth and serves it on Union. Unsloth makes finetuning LLMs faster and more memory-efficient without degrading accuracy. Union's declarative workflow infrastructure makes it easy to specify the compute resources for finetuning, and Union Serving lets us serve the finetuned model with ~10 lines of code.
Once you have a Union account, install union:
pip install union
Export the following environment variable to build and push images to your own container registry:
# replace with your registry name
export IMAGE_SPEC_REGISTRY="<your-container-registry>"
Then run the following commands to run the workflow:
$ git clone https://github.com/unionai/unionai-examples
$ cd unionai-examples
$ union run --remote <path/to/file.py> <workflow_name> <params>
The full source code for this example can be found in the unionai-examples repository.
Defining Workflow Dependencies
First, we import the modules needed by our workflow:
from typing import Annotated

from flytekit import Cache
from flytekit.extras.accelerators import L4, GPUAccelerator
from union import Artifact, FlyteDirectory, ImageSpec, Resources, task, workflow
Next, we define an ImageSpec that contains the Python dependencies for the finetuning task:
image = ImageSpec(
name="unsloth-finetune",
apt_packages=["build-essential"],
packages=[
"torch==2.7.0",
"huggingface-hub[hf_transfer]==0.31.1",
"pandas==2.2.3",
"union",
],
    registry="ghcr.io/unionai-oss",  # replace with your own container registry
env={"HF_HUB_ENABLE_HF_TRANSFER": "1"},
commands=["uv pip install unsloth==2025.4.7"],
)
Note that we set HF_HUB_ENABLE_HF_TRANSFER=1 to use Hugging Face's faster Rust-based downloader. build-essential is required by torch.compile, which optimizes the model for training.
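The same setting accelerates any Hugging Face Hub download inside the image. As a small illustration (prefetching the base model this way is optional and not part of the workflow):

from huggingface_hub import snapshot_download

# With HF_HUB_ENABLE_HF_TRANSFER=1 and hf_transfer installed, this
# download goes through the Rust-based backend.
snapshot_download("unsloth/Qwen3-1.7B-unsloth-bnb-4bit")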
Finetuning Workflow
Next, we define two artifacts:
- qwen-tuned: the output of Unsloth's finetuning task
- vllm-qwen-model: the Unsloth model converted into a format that vLLM can serve
TUNED_MODEL = Artifact(name="qwen-tuned")
SAVED_VLLM_MODEL = Artifact(name="vllm-qwen-model")
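Other workflows can consume these artifacts by name via query(), the same mechanism the serving app uses below. A minimal sketch, where evaluate_latest and its evaluate task are hypothetical:

# Hypothetical consumer: query() resolves to the latest artifact with
# this name and can serve as a default workflow input.
@workflow
def evaluate_latest(model_dir: FlyteDirectory = TUNED_MODEL.query()):
    evaluate(model_dir=model_dir)  # `evaluate` is a hypothetical eval task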
We define the finetuning task by declaring its resources, such as an L4 GPU and the ephemeral storage used to hold the dataset.
@task(
container_image=image,
requests=Resources(mem="23Gi", gpu="1", ephemeral_storage="20Gi", cpu="6"),
accelerator=L4,
cache=Cache(version="v1"),
)
def finetune() -> Annotated[FlyteDirectory, TUNED_MODEL]:
"""Finetune model with Unsloth."""
    # Import unsloth first so it can patch transformers and TRL.
    from unsloth import FastLanguageModel
    from unsloth.chat_templates import standardize_sharegpt

    import pandas as pd
    from datasets import Dataset, load_dataset
    from trl import SFTConfig, SFTTrainer
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="unsloth/Qwen3-1.7B-unsloth-bnb-4bit",
max_seq_length=2048,
load_in_4bit=True,
load_in_8bit=False,
full_finetuning=False,
)
model = FastLanguageModel.get_peft_model(
model,
r=32,
target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
],
lora_alpha=32,
lora_dropout=0,
bias="none",
use_gradient_checkpointing="unsloth",
random_state=3407,
use_rslora=False,
loftq_config=None,
)
reasoning_dataset = load_dataset("unsloth/OpenMathReasoning-mini", split="cot")
non_reasoning_dataset = load_dataset("mlabonne/FineTome-100k", split="train")
def generate_conversation(examples):
problems = examples["problem"]
solutions = examples["generated_solution"]
conversations = []
for problem, solution in zip(problems, solutions):
conversations.append(
[
{"role": "user", "content": problem},
{"role": "assistant", "content": solution},
]
)
return {
"conversations": conversations,
}
reasoning_conversations = tokenizer.apply_chat_template(
reasoning_dataset.map(generate_conversation, batched=True)["conversations"],
tokenize=False,
)
dataset = standardize_sharegpt(non_reasoning_dataset)
non_reasoning_conversations = tokenizer.apply_chat_template(
dataset["conversations"],
tokenize=False,
)
chat_percentage = 0.75
non_reasoning_subset = pd.Series(non_reasoning_conversations)
non_reasoning_subset = non_reasoning_subset.sample(
int(len(reasoning_conversations) * (1.0 - chat_percentage)),
random_state=2407,
)
data = pd.concat([pd.Series(reasoning_conversations), pd.Series(non_reasoning_subset)])
data.name = "text"
combined_dataset = Dataset.from_pandas(pd.DataFrame(data))
combined_dataset = combined_dataset.shuffle(seed=3407)
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=combined_dataset,
eval_dataset=None,
args=SFTConfig(
dataset_text_field="text",
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
warmup_steps=5,
            max_steps=1,  # kept tiny so the example runs quickly; increase for a real finetune
learning_rate=2e-4,
logging_steps=1,
optim="adamw_8bit",
weight_decay=0.01,
lr_scheduler_type="linear",
seed=3407,
report_to="none",
),
)
trainer.train()
lora_path = FlyteDirectory.new("lora_model")
model.save_pretrained(lora_path.path)
tokenizer.save_pretrained(lora_path.path)
return lora_path
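Before converting the model for serving, you could sanity-check the tuned adapter with a quick generation inside the task, before saving. A minimal sketch using Unsloth's inference mode (the prompt is illustrative):

# Optional check before saving: generate once from the tuned adapter.
FastLanguageModel.for_inference(model)  # switch Unsloth to fast inference
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Solve (x + 2)^2 = 0."}],
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)
print(tokenizer.decode(model.generate(inputs, max_new_tokens=128)[0]))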
Unsloth can quantize a finetuned model into formats that are better suited for serving, such as GGUF. In this next task, we merge the LoRA adapter into the base model at 16-bit precision so that vLLM can serve it directly (a GGUF variant is sketched after the task).
@task(
container_image=image,
requests=Resources(mem="23Gi", gpu="1", ephemeral_storage="20Gi", cpu="6"),
accelerator=L4,
cache=Cache(version="v1"),
)
def convert_vllm(tuned_model: FlyteDirectory) -> Annotated[FlyteDirectory, SAVED_VLLM_MODEL]:
"""Convert model for VLLM to consume."""
from unsloth import FastLanguageModel
tuned_model.download()
model, tokenizer = FastLanguageModel.from_pretrained(
model_name=tuned_model.path,
max_seq_length=2048,
load_in_4bit=True,
)
    vllm_model_dir = FlyteDirectory.new("vllm_model")
    model.save_pretrained_merged(
        vllm_model_dir.path,
        tokenizer,
        save_method="merged_16bit",
    )
return vllm_model_dir
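As mentioned above, Unsloth also supports GGUF exports for llama.cpp-based runtimes. A sketch of the alternative saving step, where q4_k_m is one common quantization method:

# Alternative to the 16-bit merge: export a quantized GGUF file instead.
model.save_pretrained_gguf(
    vllm_model_dir.path,
    tokenizer,
    quantization_method="q4_k_m",
)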
We define a simple workflow that finetunes the model and then converts it into a format that vLLM can serve.
@workflow
def unsloth_finetune():
tuned_model = finetune()
convert_vllm(tuned_model=tuned_model)
Defining the vLLM App
Finally, we configure the VLLMApp to serve the finetuned model. The model is set to the artifact returned by the convert_vllm task.
from union.app.llm import VLLMApp
app = VLLMApp(
name="unsloth-qwen-tuned",
container_image="ghcr.io/unionai-oss/serving-vllm:0.1.17",
requests=Resources(mem="23Gi", gpu="1", ephemeral_storage="20Gi", cpu="6"),
model=SAVED_VLLM_MODEL.query(),
model_id="unsloth-qwen",
stream_model=True,
accelerator=GPUAccelerator("nvidia-l40s"),
)
To run the finetune workflow with Unsloth:
union run --remote unsloth_finetune.py unsloth_finetune
To deploy the vLLM serving app on Union:
union deploy apps unsloth_finetune.py unsloth-qwen-tuned
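Once deployed, the app exposes vLLM's OpenAI-compatible API. A minimal sketch of querying it, assuming the openai Python client, with the endpoint URL and API key as placeholders:

from openai import OpenAI

client = OpenAI(
    base_url="https://<your-app-endpoint>/v1",  # endpoint shown after deployment
    api_key="<your-api-key>",
)
response = client.chat.completions.create(
    model="unsloth-qwen",  # the model_id configured on the VLLMApp
    messages=[{"role": "user", "content": "What is 15% of 240?"}],
)
print(response.choices[0].message.content)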