📚 Book Reference: This article is based on Chapter 5: Custom Applications of Practical RHEL AI, covering Ansible playbooks and CI/CD automation for enterprise AI workflows.
Manual AI deployments don't scale. Whether you're managing 3 servers or 300, Ansible provides the automation backbone for consistent, repeatable RHEL AI deployments. Practical RHEL AI includes ready-to-use playbooks that I'll share in this article.
| Approach | Time to Deploy | Consistency | Rollback Capability |
|---|---|---|---|
| Manual | Hours | Variable | Manual |
| Scripts | 30 minutes | Better | Complex |
| Ansible | 5 minutes | 100% | Built-in |
rhel-ai-automation/
├── inventory/
│   ├── production.yml
│   └── staging.yml
├── group_vars/
│   ├── all.yml
│   └── gpu_nodes.yml
├── roles/
│   ├── rhel_ai_base/
│   ├── gpu_setup/
│   ├── vllm_server/
│   ├── instructlab/
│   └── monitoring/
├── playbooks/
│   ├── site.yml
│   ├── deploy_model.yml
│   └── update_model.yml
└── ansible.cfg

# inventory/production.yml
all:
  children:
    # Hosts with local accelerators; per-host vars drive the gpu_setup role.
    gpu_nodes:
      hosts:
        ai-node-01:
          ansible_host: 192.168.1.101
          gpu_type: nvidia_a100
          gpu_count: 4
        ai-node-02:
          ansible_host: 192.168.1.102
          gpu_type: nvidia_h100
          gpu_count: 8
    training_nodes:
      hosts:
        train-01:
          ansible_host: 192.168.1.110
          deepspeed_config: multi_gpu
    inference_nodes:
      hosts:
        infer-01:
          ansible_host: 192.168.1.120
        infer-02:
          ansible_host: 192.168.1.121

# roles/rhel_ai_base/tasks/main.yml
---
# Base provisioning shared by every RHEL AI node: repository, core
# packages, GPU-aware container runtime, and SELinux-labelled model storage.

- name: Enable RHEL AI repository
  ansible.builtin.dnf:
    name: "rhel-ai-release"
    state: present
  become: true

- name: Install RHEL AI core packages
  ansible.builtin.dnf:
    name:
      - rhel-ai
      - python3.11
      - podman
      - nvidia-container-toolkit
    state: present
  become: true

- name: Configure container runtime for GPU
  ansible.builtin.template:
    src: nvidia-container-runtime.toml.j2
    dest: /etc/nvidia-container-runtime/config.toml
    mode: '0644'
  become: true
  notify: Restart podman

# Label /opt/models so containers may mount it under SELinux enforcing mode.
# NOTE(review): sefcontext only records the policy; existing files are not
# relabelled until a restorecon runs — confirm a handler covers that.
- name: Set SELinux context for model directory
  community.general.sefcontext:
    target: '/opt/models(/.*)?'
    setype: container_file_t
    state: present
  become: true

- name: Create model storage directory
  ansible.builtin.file:
    path: /opt/models
    state: directory
    mode: '0755'
    owner: "{{ ai_user }}"
    group: "{{ ai_group }}"
  become: true

# roles/gpu_setup/tasks/nvidia.yml
---
# NVIDIA-specific GPU setup: drivers, detection sanity check, module tuning.

- name: Install NVIDIA drivers
  ansible.builtin.dnf:
    name:
      - nvidia-driver
      - nvidia-driver-cuda
      - cuda-toolkit-12-4
    state: present
  become: true
  when: gpu_type is match("nvidia.*")

# Fails the play here if the driver stack left no GPU visible to nvidia-smi.
- name: Verify GPU detection
  ansible.builtin.command: nvidia-smi --query-gpu=name,memory.total --format=csv
  register: gpu_info
  changed_when: false

- name: Display GPU information
  ansible.builtin.debug:
    msg: "Detected GPUs: {{ gpu_info.stdout_lines }}"

- name: Configure GPU memory settings
  ansible.builtin.template:
    src: gpu-settings.conf.j2
    dest: /etc/modprobe.d/nvidia.conf
    mode: '0644'
  become: true
  notify: Reload nvidia modules

# roles/vllm_server/tasks/main.yml
---
# Deploy the vLLM inference server as a systemd-managed Podman container.

- name: Pull vLLM container image
  containers.podman.podman_image:
    name: "{{ vllm_image }}"
    tag: "{{ vllm_version }}"
  become: true

- name: Create vLLM configuration
  ansible.builtin.template:
    src: vllm-config.yaml.j2
    dest: /opt/vllm/config.yaml
    mode: '0644'
  become: true
  notify: Restart vllm service

- name: Deploy vLLM systemd service
  ansible.builtin.template:
    src: vllm.service.j2
    dest: /etc/systemd/system/vllm.service
    mode: '0644'
  become: true
  notify:
    - Reload systemd
    - Restart vllm service

- name: Ensure vLLM service is running
  ansible.builtin.systemd:
    name: vllm
    state: started
    enabled: true
  become: true

# Poll /health until the model server answers (up to 30 × 10 s = 5 min),
# covering slow model loads on first start.
- name: Wait for vLLM health check
  ansible.builtin.uri:
    url: "http://localhost:{{ vllm_port }}/health"
    method: GET
    status_code: 200
  register: health_check
  until: health_check.status == 200
  retries: 30
  delay: 10

# roles/instructlab/tasks/main.yml
---
# Install InstructLab in a dedicated venv, fetch the taxonomy, and
# optionally kick off synthetic data generation.

- name: Install InstructLab CLI
  ansible.builtin.pip:
    name: instructlab
    state: present
    virtualenv: /opt/instructlab/venv
  become: true

- name: Clone taxonomy repository
  ansible.builtin.git:
    repo: "{{ taxonomy_repo }}"
    dest: /opt/instructlab/taxonomy
    version: "{{ taxonomy_version }}"
  become: true
  become_user: "{{ ai_user }}"

- name: Configure InstructLab
  ansible.builtin.template:
    src: config.yaml.j2
    dest: /opt/instructlab/config.yaml
    mode: '0644'
  become: true

# Long-running generation is launched async (1 h budget) and polled every
# 60 s so the connection is not held open for the whole run. chdir belongs
# inside the module dict when the dict form of command is used.
- name: Generate synthetic training data
  ansible.builtin.command:
    cmd: >
      /opt/instructlab/venv/bin/ilab data generate
      --config /opt/instructlab/config.yaml
      --num-instructions {{ synthetic_data_count }}
    chdir: /opt/instructlab
  become: true
  become_user: "{{ ai_user }}"
  when: generate_synthetic_data | default(false)
  async: 3600
  poll: 60

# playbooks/site.yml
---
# Top-level playbook: base everywhere, then group-specific roles.

- name: Deploy RHEL AI Infrastructure
  hosts: all
  become: true
  pre_tasks:
    # Abort early on unsupported platforms rather than failing mid-role.
    - name: Verify RHEL version
      ansible.builtin.assert:
        that:
          - ansible_distribution == "RedHat"
          - ansible_distribution_major_version | int >= 9
        fail_msg: "RHEL 9+ required for RHEL AI"
  roles:
    - role: rhel_ai_base
      tags: [base, always]

- name: Configure GPU Nodes
  hosts: gpu_nodes
  become: true
  roles:
    - role: gpu_setup
      tags: [gpu]

- name: Deploy Inference Servers
  hosts: inference_nodes
  become: true
  roles:
    - role: vllm_server
      tags: [inference, vllm]

- name: Deploy Training Environment
  hosts: training_nodes
  become: true
  roles:
    - role: instructlab
      tags: [training, instructlab]

- name: Setup Monitoring
  hosts: all
  become: true
  roles:
    - role: monitoring
      tags: [monitoring, prometheus]

# .gitlab-ci.yml
# Pipeline: lint → staging deploy → model validation → gated production deploy.
stages:
  - validate
  - deploy-staging
  - test
  - deploy-production

variables:
  ANSIBLE_CONFIG: ./ansible.cfg

# Lint and syntax-check on any YAML change before anything deploys.
validate:
  stage: validate
  image: registry.redhat.io/ansible-automation-platform/ee-supported-rhel9:latest
  script:
    - ansible-lint playbooks/
    - ansible-playbook playbooks/site.yml --syntax-check
  rules:
    - changes:
        - "**/*.yml"
        - "**/*.yaml"

deploy-staging:
  stage: deploy-staging
  image: registry.redhat.io/ansible-automation-platform/ee-supported-rhel9:latest
  script:
    - ansible-playbook -i inventory/staging.yml playbooks/site.yml
  environment:
    name: staging
  rules:
    - if: $CI_COMMIT_BRANCH == "main"

model-validation:
  stage: test
  script:
    - python tests/validate_model.py --endpoint $STAGING_ENDPOINT
    - python tests/benchmark_latency.py --target-p95 80
  needs: [deploy-staging]

# Production deploy is gated on passing model validation AND a manual click.
deploy-production:
  stage: deploy-production
  image: registry.redhat.io/ansible-automation-platform/ee-supported-rhel9:latest
  script:
    - ansible-playbook -i inventory/production.yml playbooks/site.yml
  environment:
    name: production
  when: manual
  needs: [model-validation]

# playbooks/update_model.yml
---
# Rolling model update: drain each inference node, swap the model,
# verify health, and re-enable it before moving to the next host.

- name: Update AI Model with Zero Downtime
  hosts: inference_nodes
  serial: 1  # rolling update: at most one node out of service at a time
  become: true
  vars:
    new_model_version: "{{ model_version | default('latest') }}"
  tasks:
    - name: Remove node from load balancer
      ansible.builtin.uri:
        url: "{{ lb_api }}/nodes/{{ inventory_hostname }}/disable"
        method: POST
      delegate_to: localhost

    # Grace period for in-flight requests to drain before the restart.
    - name: Wait for active requests to complete
      ansible.builtin.wait_for:
        timeout: 60

    - name: Pull new model version
      containers.podman.podman_image:
        name: "{{ model_registry }}/{{ model_name }}"
        tag: "{{ new_model_version }}"

    - name: Update vLLM configuration
      ansible.builtin.lineinfile:
        path: /opt/vllm/config.yaml
        regexp: '^model:'
        line: "model: {{ model_registry }}/{{ model_name }}:{{ new_model_version }}"
      notify: Restart vllm service

    # Run the restart handler now, inside this host's drain window,
    # instead of at the end of the play.
    - name: Flush handlers
      ansible.builtin.meta: flush_handlers

    - name: Verify model health
      ansible.builtin.uri:
        url: "http://localhost:{{ vllm_port }}/health"
        status_code: 200
      register: health
      until: health.status == 200
      retries: 30
      delay: 5

    - name: Re-enable node in load balancer
      ansible.builtin.uri:
        url: "{{ lb_api }}/nodes/{{ inventory_hostname }}/enable"
        method: POST
      delegate_to: localhost

This article covers material from:
Want the complete Ansible collection for RHEL AI?
Practical RHEL AI includes a downloadable playbook repository with:
Practical RHEL AI gives you everything you need to deploy and manage enterprise AI at scale with Ansible.
Learn More → | Buy on Amazon →