Ansible Error Handling: ignore_errors, failed_when, changed_when

Ansible มีกลไก error handling ที่ยืดหยุ่น ช่วยให้ Playbook ทำงานต่อได้แม้บาง task จะล้มเหลว หรือกำหนดว่าเมื่อไรจึงถือว่า task สำเร็จหรือมีการเปลี่ยนแปลง — ignore_errors, failed_when และ changed_when เป็น directives หลักที่ใช้ควบคุมพฤติกรรมนี้

บทความนี้ครอบคลุมการใช้ ignore_errors เพื่อ skip errors ที่ไม่สำคัญ, failed_when เพื่อกำหนดเงื่อนไข failure เอง, changed_when เพื่อควบคุม changed status, การใช้ block กับ rescue สำหรับ try-catch pattern และ any_errors_fatal สำหรับ fail-fast

ignore_errors — ข้าม Error และทำงานต่อ

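ignore_errors: true บอก Ansible ให้ทำ task ถัดไปต่อแม้ task นี้จะ fail — ผลที่ fail ยังถูกเก็บใน registered variable ทำให้ตรวจสอบและ handle ได้ในภายหลัง ข้อควรระวัง: ignore_errors มีผลเฉพาะกับ task ที่รันได้จริงแล้วคืนสถานะ failed เท่านั้น ไม่ครอบคลุมกรณี host unreachable, undefined variable หรือ syntax error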

---
# Play demonstrating ignore_errors: a task that fails is still reported, but
# the host keeps running, and the registered result stays available to later
# tasks for inspection.
- name: ignore_errors examples
  hosts: all
  tasks:
    # The legacy service may be absent on some hosts, so a failure to stop it
    # is tolerated.
    - name: Stop legacy service (may not exist)
      service:
        state: stopped
        name: legacy-daemon
      ignore_errors: true

    # Status probe: the return code is captured in app_status instead of
    # failing the host, and the task is never reported as "changed".
    - name: Check if application is running
      command:
        cmd: systemctl is-active myapp
      changed_when: false
      ignore_errors: true
      register: app_status

    - name: Show application status
      debug:
        msg: "myapp is {{ 'running' if app_status.rc == 0 else 'not running' }}"

    # Optional script: run it, keep going on failure, and examine the result
    # in the next task.
    - name: Run optional cleanup script
      command:
        cmd: /opt/myapp/bin/cleanup.sh
      ignore_errors: true
      register: cleanup_result

    # Only emitted when the cleanup task above was marked failed.
    - name: Report cleanup failure if it occurred
      when: cleanup_result.failed
      debug:
        msg: "Cleanup failed: {{ cleanup_result.stderr }}"

failed_when — กำหนดเงื่อนไข Failure เอง

failed_when ใช้กำหนดว่า task ควร fail เมื่อใด โดยพิจารณาจาก output, return code หรือ condition อื่น ๆ แทนที่จะใช้ค่า default ของ Ansible — เหมาะสำหรับ command ที่ return code ไม่ตรงกับ convention ปกติ

---
# Play demonstrating failed_when: the failure decision is taken from the
# registered result (stdout, return code) instead of Ansible's default rule.
- name: failed_when examples
  hosts: all
  tasks:
    # Fail only when the output contains "ERROR". The custom condition is the
    # sole failure criterion here, so the script's return code is not checked.
    - name: Run health check
      command: /opt/myapp/bin/health-check.sh
      register: health_result
      failed_when: '"ERROR" in health_result.stdout'
      changed_when: false

    # Fail when the return code is neither 0 nor 1 (this script uses
    # 1 = warning, not error).
    - name: Run validation script
      command: /opt/scripts/validate.sh
      register: validate_result
      failed_when: validate_result.rc not in [0, 1]

    # Fail when root filesystem usage is above 90%.
    - name: Check disk usage
      command: df -h /
      register: disk_usage
      changed_when: false
      failed_when: false   # never fails here; the next task evaluates the output

    # The second-to-last whitespace-separated token of the df output is the
    # Use% column of the data row (e.g. "40%"). Strip the percent sign and
    # compare numerically: a substring test such as '"9" in ...' would also
    # trigger on 9%, 19%, 29%, and so on.
    - name: Assert disk usage is acceptable
      fail:
        msg: "Disk usage critical — check server immediately"
      when: "(disk_usage.stdout.split()[-2] | replace('%', '') | int) > 90"

    # Combined conditions: entries of a failed_when list are joined with AND,
    # so this fails only when the return code is non-zero and the output does
    # not mention "rollback".
    - name: Complex failure condition
      command: /opt/deploy/run.sh
      register: deploy_result
      failed_when:
        - deploy_result.rc != 0
        - '"rollback" not in deploy_result.stdout'

changed_when — ควบคุม Changed Status

changed_when กำหนดว่า task จะถือว่า “changed” เมื่อใด — ค่า false ทำให้ task แสดงเป็น ok เสมอ (เหมาะสำหรับ read-only commands), ส่วน expression ช่วยให้ตัดสินจาก output ได้

---
# Play demonstrating changed_when: the "changed" status is derived from the
# registered result instead of being reported for every command run.
- name: changed_when examples
  hosts: all
  tasks:
    # Read-only command: never mark it as changed.
    - name: Get system information
      command: uname -a
      register: sys_info
      changed_when: false    # reported as "ok" rather than "changed"

    # Mark as changed only when the script reports that it applied something.
    - name: Run database migration
      command: /opt/myapp/bin/migrate.sh
      register: migrate_result
      changed_when: '"Applied" in migrate_result.stdout'

    # Derive both statuses from a custom return-code contract
    # (script returns 0 = no change, 1 = changed, 2 = error).
    # Any return code outside 0/1 is treated as a failure, so an unexpected
    # code (for example from a crash or a missing interpreter) is not
    # silently reported as success.
    - name: Run idempotent configuration script
      command: /opt/scripts/configure.sh
      register: config_result
      changed_when: config_result.rc == 1
      failed_when: config_result.rc not in [0, 1]

    # Both directives together: a read-only check that neither marks the
    # play as changed nor fails it; cert_check is left for later inspection.
    - name: Verify SSL certificate expiry
      command: "openssl x509 -noout -enddate -in /etc/ssl/certs/mysite.crt"
      register: cert_check
      changed_when: false
      failed_when: false

block / rescue / always — Try-Catch Pattern

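block ร่วมกับ rescue และ always ให้ error handling แบบ try-catch — tasks ใน rescue จะรันเมื่อมี task ใน block fail, ส่วน always รันทุกกรณีไม่ว่า block จะ success หรือ fail ทั้งนี้ถ้า rescue ทำงานสำเร็จ Ansible จะถือว่า error ถูกจัดการแล้วและ host นั้นไม่ถูกนับว่า failed — หากต้องการให้ Playbook ยังรายงานความล้มเหลว ให้ปิดท้าย rescue ด้วย task fail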

---
# Play demonstrating block/rescue/always as a try/catch/finally construct:
# deploy a new version, roll back to the previous one on failure, and always
# clean up and log the attempt.
# Expects app_repo, deploy_version and previous_version to be supplied from
# outside this play.
- name: block/rescue/always example
  hosts: all
  become: true
  tasks:
    - name: Deploy application with rollback
      block:
        # --- Try ---
        - name: Stop current application
          service:
            name: myapp
            state: stopped

        - name: Deploy new version
          git:
            repo: "{{ app_repo }}"
            dest: /opt/myapp/current
            version: "{{ deploy_version }}"
            force: true

        - name: Start application
          service:
            name: myapp
            state: started

        # An explicit until condition is given so that retries/delay take
        # effect on every Ansible version; older releases ignore retries
        # when no until is set.
        - name: Verify application health
          uri:
            url: http://localhost:8080/health
            status_code: 200
          register: health_result
          until: health_result.status == 200
          retries: 5
          delay: 3

      rescue:
        # --- Catch (runs when a task in the block failed) ---
        # Record the outcome first so the always section can report it even
        # if a later rollback step fails.
        - name: Mark deployment as failed
          set_fact:
            deploy_failed: true

        - name: Rollback to previous version
          git:
            repo: "{{ app_repo }}"
            dest: /opt/myapp/current
            version: "{{ previous_version }}"
            force: true

        - name: Restart with previous version
          service:
            name: myapp
            state: restarted

        - name: Notify team about rollback
          debug:
            msg: "Deployment failed — rolled back to {{ previous_version }}"

      always:
        # --- Finally (always runs) ---
        - name: Cleanup temporary deploy files
          file:
            path: /tmp/deploy-staging
            state: absent

        # deploy_failed is only set by the rescue section, so default(false)
        # covers the success path. Referencing ansible_failed_task directly
        # here would raise an undefined-variable error when nothing failed.
        - name: Record deployment attempt
          lineinfile:
            path: /var/log/deployments.log
            line: "{{ ansible_date_time.iso8601 }} deploy={{ deploy_version }} status={{ 'failed' if (deploy_failed | default(false)) else 'success' }}"
            create: true

any_errors_fatal — Fail Fast บน Error แรก

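any_errors_fatal: true ที่ระดับ Play ทำให้ความล้มเหลวของ task บน host ใดก็ตามเป็น fatal สำหรับทุก host — Ansible จะรัน task ที่ล้มเหลวนั้นให้เสร็จบนทุก host ใน batch ปัจจุบันก่อน แล้วจึงหยุด play บนทุก host — ตรงข้ามกับ default ที่ Ansible จะข้าม host ที่ fail และทำต่อบน host อื่น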

---
# Fail-fast play: with any_errors_fatal enabled, a task failure on any one
# host ends the play for every host instead of only dropping the failed host.
# Expects app_repo and deploy_version to be supplied from outside this play.
- name: Critical deployment with fail-fast
  hosts: appservers
  become: true
  any_errors_fatal: true    # an error on any host stops all hosts

  tasks:
    # If this check fails on any host, the whole play stops.
    - name: Pre-deployment checks
      command:
        cmd: /opt/myapp/bin/preflight-check.sh
      changed_when: false

    - name: Deploy application
      git:
        dest: /opt/myapp/current
        repo: "{{ app_repo }}"
        version: "{{ deploy_version }}"

    - name: Restart service
      service:
        state: restarted
        name: myapp

---
# A more flexible alternative to any_errors_fatal: max_fail_percentage aborts
# the play only once the share of failed hosts goes above the threshold.
# NOTE(review): Ansible documents that, combined with serial, the percentage
# is evaluated per batch and must be exceeded (not equaled). With batches of
# two hosts a single failure is already 50% — confirm that serial and the
# threshold match the intended tolerance.
- name: Rolling deployment with error threshold
  hosts: appservers
  max_fail_percentage: 25    # abort when more than 25% of hosts fail
  serial: 2
  tasks:
    - name: Deploy and restart
      service:
        state: restarted
        name: myapp

Pattern: Deployment with Comprehensive Error Handling

ตัวอย่าง Playbook deploy application พร้อม error handling ครบถ้วน — ใช้ block/rescue/always สำหรับ rollback อัตโนมัติ, failed_when สำหรับ health check และ ignore_errors สำหรับ cleanup tasks

---
# End-to-end deployment play combining the error-handling directives:
# block/rescue/always for automatic restore, failed_when for prerequisite and
# health checks, and ignore_errors for best-effort steps.
# Expects target_version and app_repo to be supplied from outside this play.
- name: Production deployment with error handling
  hosts: appservers
  become: true
  vars:
    app_name: myapp
    deploy_version: "{{ target_version }}"

  tasks:
    - name: Full deployment with rollback support
      block:
        # Prerequisite: require at least 2 GiB available on /opt.
        # The available-space value is the third token from the end of the
        # df output (Available, Use%, mount point), so indexing from the end
        # does not depend on how many words the header line contains.
        # A non-zero return code from df is also treated as a failure rather
        # than being parsed.
        - name: Check disk space
          command: df -BG /opt
          register: df_output
          changed_when: false
          failed_when: >-
            df_output.rc != 0
            or (df_output.stdout.split()[-3].rstrip('G') | int) < 2

        # Back up the current version.
        - name: Backup current deployment
          copy:
            src: "/opt/{{ app_name }}/current/"
            dest: "/opt/{{ app_name }}/backup/"
            remote_src: true
          ignore_errors: true   # do not fail when no current version exists yet

        # Deploy the new version.
        - name: Deploy application
          git:
            repo: "{{ app_repo }}"
            dest: "/opt/{{ app_name }}/current"
            version: "{{ deploy_version }}"
            force: true

        # Run migrations; report "changed" only when something was applied.
        - name: Run database migrations
          command: "/opt/{{ app_name }}/current/bin/migrate"
          register: migrate_result
          changed_when: '"Applied" in migrate_result.stdout'

        # Restart the service on the new version.
        - name: Restart application
          service:
            name: "{{ app_name }}"
            state: restarted

        # Verify health. An explicit until condition is given so that
        # retries/delay take effect on every Ansible version; older releases
        # ignore retries when no until is set.
        - name: Verify application is healthy
          uri:
            url: "http://localhost:8080/health"
            status_code: 200
          register: health_check
          until: health_check.status == 200
          retries: 5
          delay: 5
          failed_when: health_check.status != 200

      rescue:
        # Runs when any task in the block failed.
        # NOTE(review): this restores whatever is in the backup directory;
        # if the failure happened before the backup step, or the backup step
        # itself was ignored, the content may be stale or missing — confirm
        # this is acceptable.
        - name: Restore from backup
          copy:
            src: "/opt/{{ app_name }}/backup/"
            dest: "/opt/{{ app_name }}/current/"
            remote_src: true

        - name: Restart with backup version
          service:
            name: "{{ app_name }}"
            state: restarted

        # Fail explicitly so the host is still reported as failed after the
        # restore has completed.
        - name: Fail with informative message
          fail:
            msg: "Deployment of {{ deploy_version }} failed — restored backup version"

      always:
        # Best-effort cleanup that runs whether the deployment succeeded or not.
        - name: Remove staging files
          file:
            path: "/tmp/{{ app_name }}-deploy"
            state: absent
          ignore_errors: true

สรุป

Ansible มี error handling หลายระดับ: ignore_errors สำหรับข้าม error ที่ไม่สำคัญ, failed_when สำหรับกำหนด failure condition เอง, changed_when: false สำหรับ read-only tasks และ block/rescue/always สำหรับ try-catch pattern ที่ซับซ้อน

Pattern ที่ควรจำ: ใช้ changed_when: false กับทุก command ที่แค่ query ข้อมูล, ใช้ failed_when แทน ignore_errors เมื่อต้องการตัดสินจาก output, ใช้ block/rescue เพื่อ rollback อัตโนมัติใน deployment และใช้ any_errors_fatal ในงาน critical ที่ต้องการ consistency บนทุก host