/* This file is autogenerated by u_format_table.py from u_format.csv. Do not edit directly. */

/**************************************************************************
 *
 * Copyright 2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

#include "u_format.h"
#include "u_format_bptc.h"
#include "u_format_s3tc.h"
#include "u_format_rgtc.h"
#include "u_format_latc.h"
#include "u_format_etc.h"


#include "pipe/p_compiler.h"
#include "util/u_math.h"
#include "util/u_half.h"
#include "u_format.h"
#include "u_format_other.h"
#include "util/format_srgb.h"
#include "u_format_yuv.h"
#include "u_format_zs.h"

/**
 * Unpack a rectangle of one-byte "none" pixels to RGBA float.
 *
 * The source byte is converted to float without any scaling and written to
 * the red channel; green and blue are set to 0 and alpha to 1.
 * Both strides are byte counts (dst_stride is divided by the float size
 * before it is applied to the float pointer).
 */
static inline void
util_format_none_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint8_t red = in[0];
         out[0] = (float)red; /* r */
         out[1] = 0;          /* g */
         out[2] = 0;          /* b */
         out[3] = 1;          /* a */
         in += 1;
         out += 4;
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack RGBA float pixels into one-byte "none" pixels.
 *
 * Only the red channel is used: it is passed through CLAMP() with the
 * bounds 0.0f and 255.0f and then converted to uint8_t.  Green, blue and
 * alpha are ignored.  Both strides are byte counts.
 */
static inline void
util_format_none_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      uint8_t *out = dst_row;
      const float *in = src_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint8_t packed = (uint8_t)CLAMP(in[0], 0.0f, 255.0f);
         *out = packed;
         out += 1;
         in += 4;
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Fetch a single one-byte "none" pixel as RGBA float.
 *
 * The byte at src becomes the red value (plain integer-to-float
 * conversion); green and blue are 0 and alpha is 1.  The i and j
 * arguments are not used.
 */
static inline void
util_format_none_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint8_t red = src[0];

   dst[0] = (float)red; /* r */
   dst[1] = 0;          /* g */
   dst[2] = 0;          /* b */
   dst[3] = 1;          /* a */
}

/**
 * Unpack one-byte "none" pixels to RGBA with 8 bits per channel.
 *
 * Red is MIN2(source byte, 1) multiplied by 0xff; green and blue are 0 and
 * alpha is 255.  Both strides are byte counts.
 */
static inline void
util_format_none_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint8_t red = in[0];
         out[0] = (uint8_t)(((uint32_t)MIN2(red, 1)) * 0xff); /* r */
         out[1] = 0;   /* g */
         out[2] = 0;   /* b */
         out[3] = 255; /* a */
         in += 1;
         out += 4;
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack 8-bit-per-channel RGBA pixels into one-byte "none" pixels.
 *
 * The stored byte is the red channel divided by 0xff using integer
 * division, i.e. 1 when red is 255 and 0 otherwise.  Green, blue and alpha
 * are ignored.  Both strides are byte counts.
 */
static inline void
util_format_none_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      uint8_t *out = dst_row;
      const uint8_t *in = src_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t red = in[0];
         *out = (uint8_t)(red / 0xff);
         out += 1;
         in += 4;
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Unpack B8G8R8A8 pixels to RGBA float.
 *
 * Each 4-byte source pixel is loaded as one uint32_t and split into four
 * 8-bit channels; the bit position of each channel depends on
 * UTIL_ARCH_BIG_ENDIAN (see the shift table).  Every channel is converted
 * with ubyte_to_float() and written in R, G, B, A order.
 * Both strides are byte counts.
 */
static inline void
util_format_b8g8r8a8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8, a_shift = 0;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16, a_shift = 24;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t red = (pixel >> r_shift) & 0xff;
         const uint32_t green = (pixel >> g_shift) & 0xff;
         const uint32_t blue = (pixel >> b_shift) & 0xff;
         const uint32_t alpha = (pixel >> a_shift) & 0xff;
         out[0] = ubyte_to_float(red);   /* r */
         out[1] = ubyte_to_float(green); /* g */
         out[2] = ubyte_to_float(blue);  /* b */
         out[3] = ubyte_to_float(alpha); /* a */
         in += 4;
         out += 4;
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack RGBA float pixels into B8G8R8A8.
 *
 * Each channel is converted with float_to_ubyte(), masked to 8 bits and
 * placed at its bit position inside one uint32_t (positions depend on
 * UTIL_ARCH_BIG_ENDIAN); the word is then stored to the destination.
 * Both strides are byte counts.
 */
static inline void
util_format_b8g8r8a8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8, a_shift = 0;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16, a_shift = 24;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      uint8_t *out = dst_row;
      const float *in = src_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t blue = (uint32_t)float_to_ubyte(in[2]) & 0xff;
         const uint32_t green = (uint32_t)float_to_ubyte(in[1]) & 0xff;
         const uint32_t red = (uint32_t)float_to_ubyte(in[0]) & 0xff;
         const uint32_t alpha = (uint32_t)float_to_ubyte(in[3]) & 0xff;
         *(uint32_t *)out = (blue << b_shift) | (green << g_shift) |
                            (red << r_shift) | (alpha << a_shift);
         out += 4;
         in += 4;
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Fetch one B8G8R8A8 pixel as RGBA float.
 *
 * The pixel at src is loaded as a uint32_t, split into 8-bit channels at
 * positions selected by UTIL_ARCH_BIG_ENDIAN, and each channel is converted
 * with ubyte_to_float().  The i and j arguments are not used.
 */
static inline void
util_format_b8g8r8a8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8, a_shift = 0;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16, a_shift = 24;
#endif
   const uint32_t pixel = *(const uint32_t *)src;
   const uint32_t red = (pixel >> r_shift) & 0xff;
   const uint32_t green = (pixel >> g_shift) & 0xff;
   const uint32_t blue = (pixel >> b_shift) & 0xff;
   const uint32_t alpha = (pixel >> a_shift) & 0xff;

   dst[0] = ubyte_to_float(red);   /* r */
   dst[1] = ubyte_to_float(green); /* g */
   dst[2] = ubyte_to_float(blue);  /* b */
   dst[3] = ubyte_to_float(alpha); /* a */
}

/**
 * Unpack B8G8R8A8 pixels to RGBA with 8 bits per channel.
 *
 * Each source pixel is loaded as one uint32_t; the four 8-bit channels are
 * extracted at bit positions chosen by UTIL_ARCH_BIG_ENDIAN and written out
 * unchanged in R, G, B, A byte order.  Both strides are byte counts.
 */
static inline void
util_format_b8g8r8a8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8, a_shift = 0;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16, a_shift = 24;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t pixel = *(const uint32_t *)in;
         out[0] = (pixel >> r_shift) & 0xff; /* r */
         out[1] = (pixel >> g_shift) & 0xff; /* g */
         out[2] = (pixel >> b_shift) & 0xff; /* b */
         out[3] = (pixel >> a_shift) & 0xff; /* a */
         in += 4;
         out += 4;
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack 8-bit-per-channel RGBA pixels into B8G8R8A8.
 *
 * The four source bytes (R, G, B, A) are shifted to the bit positions
 * selected by UTIL_ARCH_BIG_ENDIAN, combined into one uint32_t and stored
 * to the destination.  Both strides are byte counts.
 */
static inline void
util_format_b8g8r8a8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8, a_shift = 0;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16, a_shift = 24;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      uint8_t *out = dst_row;
      const uint8_t *in = src_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t red = in[0];
         const uint32_t green = in[1];
         const uint32_t blue = in[2];
         const uint32_t alpha = in[3];
         *(uint32_t *)out = (blue << b_shift) | (green << g_shift) |
                            (red << r_shift) | (alpha << a_shift);
         out += 4;
         in += 4;
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Unpack B8G8R8X8 pixels to RGBA float.
 *
 * Each source pixel is loaded as one uint32_t; blue, green and red are
 * extracted at bit positions chosen by UTIL_ARCH_BIG_ENDIAN and converted
 * with ubyte_to_float().  The remaining 8 bits are ignored and alpha is
 * written as 1.  Both strides are byte counts.
 */
static inline void
util_format_b8g8r8x8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t red = (pixel >> r_shift) & 0xff;
         const uint32_t green = (pixel >> g_shift) & 0xff;
         const uint32_t blue = (pixel >> b_shift) & 0xff;
         out[0] = ubyte_to_float(red);   /* r */
         out[1] = ubyte_to_float(green); /* g */
         out[2] = ubyte_to_float(blue);  /* b */
         out[3] = 1;                     /* a */
         in += 4;
         out += 4;
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack RGBA float pixels into B8G8R8X8.
 *
 * Red, green and blue are converted with float_to_ubyte(), masked to
 * 8 bits and placed at bit positions chosen by UTIL_ARCH_BIG_ENDIAN.
 * Source alpha is ignored and the unused 8 bits of the stored word are 0.
 * Both strides are byte counts.
 */
static inline void
util_format_b8g8r8x8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      uint8_t *out = dst_row;
      const float *in = src_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t blue = (uint32_t)float_to_ubyte(in[2]) & 0xff;
         const uint32_t green = (uint32_t)float_to_ubyte(in[1]) & 0xff;
         const uint32_t red = (uint32_t)float_to_ubyte(in[0]) & 0xff;
         *(uint32_t *)out = (blue << b_shift) | (green << g_shift) |
                            (red << r_shift);
         out += 4;
         in += 4;
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Fetch one B8G8R8X8 pixel as RGBA float.
 *
 * The pixel at src is loaded as a uint32_t; blue, green and red are
 * extracted at positions selected by UTIL_ARCH_BIG_ENDIAN and converted
 * with ubyte_to_float().  Alpha is written as 1.  The i and j arguments
 * are not used.
 */
static inline void
util_format_b8g8r8x8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16;
#endif
   const uint32_t pixel = *(const uint32_t *)src;
   const uint32_t red = (pixel >> r_shift) & 0xff;
   const uint32_t green = (pixel >> g_shift) & 0xff;
   const uint32_t blue = (pixel >> b_shift) & 0xff;

   dst[0] = ubyte_to_float(red);   /* r */
   dst[1] = ubyte_to_float(green); /* g */
   dst[2] = ubyte_to_float(blue);  /* b */
   dst[3] = 1;                     /* a */
}

/**
 * Unpack B8G8R8X8 pixels to RGBA with 8 bits per channel.
 *
 * Each source pixel is loaded as one uint32_t; blue, green and red are
 * extracted at bit positions chosen by UTIL_ARCH_BIG_ENDIAN and copied
 * through unchanged.  The remaining 8 bits are ignored and alpha is
 * written as 255.  Both strides are byte counts.
 */
static inline void
util_format_b8g8r8x8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t pixel = *(const uint32_t *)in;
         out[0] = (pixel >> r_shift) & 0xff; /* r */
         out[1] = (pixel >> g_shift) & 0xff; /* g */
         out[2] = (pixel >> b_shift) & 0xff; /* b */
         out[3] = 255;                       /* a */
         in += 4;
         out += 4;
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack 8-bit-per-channel RGBA pixels into B8G8R8X8.
 *
 * Red, green and blue are shifted to the bit positions chosen by
 * UTIL_ARCH_BIG_ENDIAN and combined into one uint32_t.  Source alpha is
 * ignored and the unused 8 bits of the stored word are 0.
 * Both strides are byte counts.
 */
static inline void
util_format_b8g8r8x8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      uint8_t *out = dst_row;
      const uint8_t *in = src_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t red = in[0];
         const uint32_t green = in[1];
         const uint32_t blue = in[2];
         *(uint32_t *)out = (blue << b_shift) | (green << g_shift) |
                            (red << r_shift);
         out += 4;
         in += 4;
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Unpack A8R8G8B8 pixels to RGBA float.
 *
 * Each 4-byte source pixel is loaded as one uint32_t and split into four
 * 8-bit channels at bit positions chosen by UTIL_ARCH_BIG_ENDIAN.  Every
 * channel is converted with ubyte_to_float() and written in R, G, B, A
 * order.  Both strides are byte counts.
 */
static inline void
util_format_a8r8g8b8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned a_shift = 0, r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t red = (pixel >> r_shift) & 0xff;
         const uint32_t green = (pixel >> g_shift) & 0xff;
         const uint32_t blue = (pixel >> b_shift) & 0xff;
         const uint32_t alpha = (pixel >> a_shift) & 0xff;
         out[0] = ubyte_to_float(red);   /* r */
         out[1] = ubyte_to_float(green); /* g */
         out[2] = ubyte_to_float(blue);  /* b */
         out[3] = ubyte_to_float(alpha); /* a */
         in += 4;
         out += 4;
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack RGBA float pixels into A8R8G8B8.
 *
 * Each channel is converted with float_to_ubyte(), masked to 8 bits and
 * placed at its bit position inside one uint32_t (positions depend on
 * UTIL_ARCH_BIG_ENDIAN); the word is then stored to the destination.
 * Both strides are byte counts.
 */
static inline void
util_format_a8r8g8b8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned a_shift = 0, r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      uint8_t *out = dst_row;
      const float *in = src_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t alpha = (uint32_t)float_to_ubyte(in[3]) & 0xff;
         const uint32_t red = (uint32_t)float_to_ubyte(in[0]) & 0xff;
         const uint32_t green = (uint32_t)float_to_ubyte(in[1]) & 0xff;
         const uint32_t blue = (uint32_t)float_to_ubyte(in[2]) & 0xff;
         *(uint32_t *)out = (alpha << a_shift) | (red << r_shift) |
                            (green << g_shift) | (blue << b_shift);
         out += 4;
         in += 4;
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Fetch one A8R8G8B8 pixel as RGBA float.
 *
 * The pixel at src is loaded as a uint32_t, split into 8-bit channels at
 * positions selected by UTIL_ARCH_BIG_ENDIAN, and each channel is converted
 * with ubyte_to_float().  The i and j arguments are not used.
 */
static inline void
util_format_a8r8g8b8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned a_shift = 0, r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   const uint32_t pixel = *(const uint32_t *)src;
   const uint32_t red = (pixel >> r_shift) & 0xff;
   const uint32_t green = (pixel >> g_shift) & 0xff;
   const uint32_t blue = (pixel >> b_shift) & 0xff;
   const uint32_t alpha = (pixel >> a_shift) & 0xff;

   dst[0] = ubyte_to_float(red);   /* r */
   dst[1] = ubyte_to_float(green); /* g */
   dst[2] = ubyte_to_float(blue);  /* b */
   dst[3] = ubyte_to_float(alpha); /* a */
}

/**
 * Unpack A8R8G8B8 pixels to RGBA with 8 bits per channel.
 *
 * Each source pixel is loaded as one uint32_t; the four 8-bit channels are
 * extracted at bit positions chosen by UTIL_ARCH_BIG_ENDIAN and written out
 * unchanged in R, G, B, A byte order.  Both strides are byte counts.
 */
static inline void
util_format_a8r8g8b8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned a_shift = 0, r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t pixel = *(const uint32_t *)in;
         out[0] = (pixel >> r_shift) & 0xff; /* r */
         out[1] = (pixel >> g_shift) & 0xff; /* g */
         out[2] = (pixel >> b_shift) & 0xff; /* b */
         out[3] = (pixel >> a_shift) & 0xff; /* a */
         in += 4;
         out += 4;
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack 8-bit-per-channel RGBA pixels into A8R8G8B8.
 *
 * The four source bytes (R, G, B, A) are shifted to the bit positions
 * selected by UTIL_ARCH_BIG_ENDIAN, combined into one uint32_t and stored
 * to the destination.  Both strides are byte counts.
 */
static inline void
util_format_a8r8g8b8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned a_shift = 0, r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      uint8_t *out = dst_row;
      const uint8_t *in = src_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t red = in[0];
         const uint32_t green = in[1];
         const uint32_t blue = in[2];
         const uint32_t alpha = in[3];
         *(uint32_t *)out = (alpha << a_shift) | (red << r_shift) |
                            (green << g_shift) | (blue << b_shift);
         out += 4;
         in += 4;
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Unpack X8R8G8B8 pixels to RGBA float.
 *
 * Each source pixel is loaded as one uint32_t; red, green and blue are
 * extracted at bit positions chosen by UTIL_ARCH_BIG_ENDIAN and converted
 * with ubyte_to_float().  The remaining 8 bits are ignored and alpha is
 * written as 1.  Both strides are byte counts.
 */
static inline void
util_format_x8r8g8b8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t red = (pixel >> r_shift) & 0xff;
         const uint32_t green = (pixel >> g_shift) & 0xff;
         const uint32_t blue = (pixel >> b_shift) & 0xff;
         out[0] = ubyte_to_float(red);   /* r */
         out[1] = ubyte_to_float(green); /* g */
         out[2] = ubyte_to_float(blue);  /* b */
         out[3] = 1;                     /* a */
         in += 4;
         out += 4;
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack RGBA float pixels into X8R8G8B8.
 *
 * Red, green and blue are converted with float_to_ubyte(), masked to
 * 8 bits and placed at bit positions chosen by UTIL_ARCH_BIG_ENDIAN.
 * Source alpha is ignored and the unused 8 bits of the stored word are 0.
 * Both strides are byte counts.
 */
static inline void
util_format_x8r8g8b8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      uint8_t *out = dst_row;
      const float *in = src_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t red = (uint32_t)float_to_ubyte(in[0]) & 0xff;
         const uint32_t green = (uint32_t)float_to_ubyte(in[1]) & 0xff;
         const uint32_t blue = (uint32_t)float_to_ubyte(in[2]) & 0xff;
         *(uint32_t *)out = (red << r_shift) | (green << g_shift) |
                            (blue << b_shift);
         out += 4;
         in += 4;
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Fetch one X8R8G8B8 pixel as RGBA float.
 *
 * The pixel at src is loaded as a uint32_t; red, green and blue are
 * extracted at positions selected by UTIL_ARCH_BIG_ENDIAN and converted
 * with ubyte_to_float().  Alpha is written as 1.  The i and j arguments
 * are not used.
 */
static inline void
util_format_x8r8g8b8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   const uint32_t pixel = *(const uint32_t *)src;
   const uint32_t red = (pixel >> r_shift) & 0xff;
   const uint32_t green = (pixel >> g_shift) & 0xff;
   const uint32_t blue = (pixel >> b_shift) & 0xff;

   dst[0] = ubyte_to_float(red);   /* r */
   dst[1] = ubyte_to_float(green); /* g */
   dst[2] = ubyte_to_float(blue);  /* b */
   dst[3] = 1;                     /* a */
}

/**
 * Unpack X8R8G8B8 pixels to RGBA with 8 bits per channel.
 *
 * Each source pixel is loaded as one uint32_t; red, green and blue are
 * extracted at bit positions chosen by UTIL_ARCH_BIG_ENDIAN and copied
 * through unchanged.  The remaining 8 bits are ignored and alpha is
 * written as 255.  Both strides are byte counts.
 */
static inline void
util_format_x8r8g8b8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t pixel = *(const uint32_t *)in;
         out[0] = (pixel >> r_shift) & 0xff; /* r */
         out[1] = (pixel >> g_shift) & 0xff; /* g */
         out[2] = (pixel >> b_shift) & 0xff; /* b */
         out[3] = 255;                       /* a */
         in += 4;
         out += 4;
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack 8-bit-per-channel RGBA pixels into X8R8G8B8.
 *
 * Red, green and blue are shifted to the bit positions chosen by
 * UTIL_ARCH_BIG_ENDIAN and combined into one uint32_t.  Source alpha is
 * ignored and the unused 8 bits of the stored word are 0.
 * Both strides are byte counts.
 */
static inline void
util_format_x8r8g8b8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      uint8_t *out = dst_row;
      const uint8_t *in = src_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t red = in[0];
         const uint32_t green = in[1];
         const uint32_t blue = in[2];
         *(uint32_t *)out = (red << r_shift) | (green << g_shift) |
                            (blue << b_shift);
         out += 4;
         in += 4;
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Unpack A8B8G8R8 pixels to RGBA float.
 *
 * Each 4-byte source pixel is loaded as one uint32_t and split into four
 * 8-bit channels at bit positions chosen by UTIL_ARCH_BIG_ENDIAN.  Every
 * channel is converted with ubyte_to_float() and written in R, G, B, A
 * order.  Both strides are byte counts.
 */
static inline void
util_format_a8b8g8r8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, b_shift = 16, g_shift = 8, r_shift = 0;
#else
   const unsigned a_shift = 0, b_shift = 8, g_shift = 16, r_shift = 24;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t red = (pixel >> r_shift) & 0xff;
         const uint32_t green = (pixel >> g_shift) & 0xff;
         const uint32_t blue = (pixel >> b_shift) & 0xff;
         const uint32_t alpha = (pixel >> a_shift) & 0xff;
         out[0] = ubyte_to_float(red);   /* r */
         out[1] = ubyte_to_float(green); /* g */
         out[2] = ubyte_to_float(blue);  /* b */
         out[3] = ubyte_to_float(alpha); /* a */
         in += 4;
         out += 4;
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack RGBA float pixels into A8B8G8R8.
 *
 * Each channel is converted with float_to_ubyte(), masked to 8 bits and
 * placed at its bit position inside one uint32_t (positions depend on
 * UTIL_ARCH_BIG_ENDIAN); the word is then stored to the destination.
 * Both strides are byte counts.
 */
static inline void
util_format_a8b8g8r8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, b_shift = 16, g_shift = 8, r_shift = 0;
#else
   const unsigned a_shift = 0, b_shift = 8, g_shift = 16, r_shift = 24;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      uint8_t *out = dst_row;
      const float *in = src_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t alpha = (uint32_t)float_to_ubyte(in[3]) & 0xff;
         const uint32_t blue = (uint32_t)float_to_ubyte(in[2]) & 0xff;
         const uint32_t green = (uint32_t)float_to_ubyte(in[1]) & 0xff;
         const uint32_t red = (uint32_t)float_to_ubyte(in[0]) & 0xff;
         *(uint32_t *)out = (alpha << a_shift) | (blue << b_shift) |
                            (green << g_shift) | (red << r_shift);
         out += 4;
         in += 4;
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Fetch one A8B8G8R8 pixel as RGBA float.
 *
 * The pixel at src is loaded as a uint32_t, split into 8-bit channels at
 * positions selected by UTIL_ARCH_BIG_ENDIAN, and each channel is converted
 * with ubyte_to_float().  The i and j arguments are not used.
 */
static inline void
util_format_a8b8g8r8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, b_shift = 16, g_shift = 8, r_shift = 0;
#else
   const unsigned a_shift = 0, b_shift = 8, g_shift = 16, r_shift = 24;
#endif
   const uint32_t pixel = *(const uint32_t *)src;
   const uint32_t red = (pixel >> r_shift) & 0xff;
   const uint32_t green = (pixel >> g_shift) & 0xff;
   const uint32_t blue = (pixel >> b_shift) & 0xff;
   const uint32_t alpha = (pixel >> a_shift) & 0xff;

   dst[0] = ubyte_to_float(red);   /* r */
   dst[1] = ubyte_to_float(green); /* g */
   dst[2] = ubyte_to_float(blue);  /* b */
   dst[3] = ubyte_to_float(alpha); /* a */
}

/**
 * Unpack A8B8G8R8 pixels to RGBA with 8 bits per channel.
 *
 * Each source pixel is loaded as one uint32_t; the four 8-bit channels are
 * extracted at bit positions chosen by UTIL_ARCH_BIG_ENDIAN and written out
 * unchanged in R, G, B, A byte order.  Both strides are byte counts.
 */
static inline void
util_format_a8b8g8r8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, b_shift = 16, g_shift = 8, r_shift = 0;
#else
   const unsigned a_shift = 0, b_shift = 8, g_shift = 16, r_shift = 24;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t pixel = *(const uint32_t *)in;
         out[0] = (pixel >> r_shift) & 0xff; /* r */
         out[1] = (pixel >> g_shift) & 0xff; /* g */
         out[2] = (pixel >> b_shift) & 0xff; /* b */
         out[3] = (pixel >> a_shift) & 0xff; /* a */
         in += 4;
         out += 4;
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack 8-bit-per-channel RGBA pixels into A8B8G8R8.
 *
 * The four source bytes (R, G, B, A) are shifted to the bit positions
 * selected by UTIL_ARCH_BIG_ENDIAN, combined into one uint32_t and stored
 * to the destination.  Both strides are byte counts.
 */
static inline void
util_format_a8b8g8r8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, b_shift = 16, g_shift = 8, r_shift = 0;
#else
   const unsigned a_shift = 0, b_shift = 8, g_shift = 16, r_shift = 24;
#endif
   unsigned rows_left;
   for (rows_left = height; rows_left != 0; --rows_left) {
      uint8_t *out = dst_row;
      const uint8_t *in = src_row;
      unsigned cols_left;
      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint32_t red = in[0];
         const uint32_t green = in[1];
         const uint32_t blue = in[2];
         const uint32_t alpha = in[3];
         *(uint32_t *)out = (alpha << a_shift) | (blue << b_shift) |
                            (green << g_shift) | (red << r_shift);
         out += 4;
         in += 4;
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Convert a rectangle of X8B8G8R8_UNORM texels into float RGBA.
 *
 * Every source texel is loaded as one native 32-bit word; the bit position
 * of each colour channel is selected by UTIL_ARCH_BIG_ENDIAN. Each 8-bit
 * channel goes through ubyte_to_float(); the X bits are not read and alpha
 * is always written as 1.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (4 floats per texel, ordered r, g, b, a).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (4 bytes per texel).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_x8b8g8r8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned shift_b = 16, shift_g = 8, shift_r = 0;
#else
   const unsigned shift_b = 8, shift_g = 16, shift_r = 24;
#endif
   unsigned row;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t texel = *(const uint32_t *)in;

         out[0] = ubyte_to_float((texel >> shift_r) & 0xff); /* r */
         out[1] = ubyte_to_float((texel >> shift_g) & 0xff); /* g */
         out[2] = ubyte_to_float((texel >> shift_b) & 0xff); /* b */
         out[3] = 1.0f; /* a */
      }

      src_row += src_stride;
      /* dst_stride is in bytes, dst_row counts floats. */
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Convert a rectangle of float RGBA texels into X8B8G8R8_UNORM.
 *
 * The r, g and b inputs are converted with float_to_ubyte() and combined
 * into one 32-bit word that is stored with a native 32-bit write; the bit
 * position of each channel is selected by UTIL_ARCH_BIG_ENDIAN. The source
 * alpha is not read and the X bits are written as zero.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (4 bytes per texel).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (4 floats per texel, ordered r, g, b, a).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_x8b8g8r8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned shift_b = 16, shift_g = 8, shift_r = 0;
#else
   const unsigned shift_b = 8, shift_g = 16, shift_r = 24;
#endif
   unsigned row;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t r = (uint32_t)float_to_ubyte(in[0]) & 0xff;
         const uint32_t g = (uint32_t)float_to_ubyte(in[1]) & 0xff;
         const uint32_t b = (uint32_t)float_to_ubyte(in[2]) & 0xff;

         *(uint32_t *)out = (r << shift_r) | (g << shift_g) | (b << shift_b);
      }

      dst_row += dst_stride;
      /* src_stride is in bytes, src_row counts floats. */
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single X8B8G8R8_UNORM texel as float RGBA.
 *
 * src points directly at the texel, which is loaded as one native 32-bit
 * word; the bit position of each colour channel is selected by
 * UTIL_ARCH_BIG_ENDIAN. The i and j arguments are not used. Alpha is
 * always written as 1.
 *
 * dst: receives 4 floats ordered r, g, b, a.
 */
static inline void
util_format_x8b8g8r8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned shift_b = 16, shift_g = 8, shift_r = 0;
#else
   const unsigned shift_b = 8, shift_g = 16, shift_r = 24;
#endif
   const uint32_t texel = *(const uint32_t *)src;

   dst[0] = ubyte_to_float((texel >> shift_r) & 0xff); /* r */
   dst[1] = ubyte_to_float((texel >> shift_g) & 0xff); /* g */
   dst[2] = ubyte_to_float((texel >> shift_b) & 0xff); /* b */
   dst[3] = 1.0f; /* a */
}

/**
 * Convert a rectangle of X8B8G8R8_UNORM texels into 8-bit RGBA.
 *
 * Every source texel is loaded as one native 32-bit word; the bit position
 * of each colour channel is selected by UTIL_ARCH_BIG_ENDIAN. The X bits
 * are not read and alpha is always written as 255.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (4 bytes per texel, ordered r, g, b, a).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (4 bytes per texel).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_x8b8g8r8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned shift_b = 16, shift_g = 8, shift_r = 0;
#else
   const unsigned shift_b = 8, shift_g = 16, shift_r = 24;
#endif
   unsigned row;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t texel = *(const uint32_t *)in;

         out[0] = (uint8_t)((texel >> shift_r) & 0xff); /* r */
         out[1] = (uint8_t)((texel >> shift_g) & 0xff); /* g */
         out[2] = (uint8_t)((texel >> shift_b) & 0xff); /* b */
         out[3] = 255; /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Convert a rectangle of 8-bit RGBA texels into X8B8G8R8_UNORM.
 *
 * The r, g and b bytes are combined into one 32-bit word that is stored
 * with a native 32-bit write; the bit position of each channel is selected
 * by UTIL_ARCH_BIG_ENDIAN. The source alpha is not read and the X bits are
 * written as zero.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (4 bytes per texel).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (4 bytes per texel, ordered r, g, b, a).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_x8b8g8r8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned shift_b = 16, shift_g = 8, shift_r = 0;
#else
   const unsigned shift_b = 8, shift_g = 16, shift_r = 24;
#endif
   unsigned row;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t texel = (((uint32_t)in[0] & 0xff) << shift_r) |
                                (((uint32_t)in[1] & 0xff) << shift_g) |
                                (((uint32_t)in[2] & 0xff) << shift_b);

         *(uint32_t *)out = texel;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Convert a rectangle of R8G8B8X8_UNORM texels into float RGBA.
 *
 * Every source texel is loaded as one native 32-bit word; the bit position
 * of each colour channel is selected by UTIL_ARCH_BIG_ENDIAN. Each 8-bit
 * channel goes through ubyte_to_float(); the X bits are not read and alpha
 * is always written as 1.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (4 floats per texel, ordered r, g, b, a).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (4 bytes per texel).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_r8g8b8x8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned shift_r = 24, shift_g = 16, shift_b = 8;
#else
   const unsigned shift_r = 0, shift_g = 8, shift_b = 16;
#endif
   unsigned row;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t texel = *(const uint32_t *)in;

         out[0] = ubyte_to_float((texel >> shift_r) & 0xff); /* r */
         out[1] = ubyte_to_float((texel >> shift_g) & 0xff); /* g */
         out[2] = ubyte_to_float((texel >> shift_b) & 0xff); /* b */
         out[3] = 1.0f; /* a */
      }

      src_row += src_stride;
      /* dst_stride is in bytes, dst_row counts floats. */
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Convert a rectangle of float RGBA texels into R8G8B8X8_UNORM.
 *
 * The r, g and b inputs are converted with float_to_ubyte() and combined
 * into one 32-bit word that is stored with a native 32-bit write; the bit
 * position of each channel is selected by UTIL_ARCH_BIG_ENDIAN. The source
 * alpha is not read and the X bits are written as zero.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (4 bytes per texel).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (4 floats per texel, ordered r, g, b, a).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_r8g8b8x8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned shift_r = 24, shift_g = 16, shift_b = 8;
#else
   const unsigned shift_r = 0, shift_g = 8, shift_b = 16;
#endif
   unsigned row;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t r = (uint32_t)float_to_ubyte(in[0]) & 0xff;
         const uint32_t g = (uint32_t)float_to_ubyte(in[1]) & 0xff;
         const uint32_t b = (uint32_t)float_to_ubyte(in[2]) & 0xff;

         *(uint32_t *)out = (r << shift_r) | (g << shift_g) | (b << shift_b);
      }

      dst_row += dst_stride;
      /* src_stride is in bytes, src_row counts floats. */
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8B8X8_UNORM texel as float RGBA.
 *
 * src points directly at the texel, which is loaded as one native 32-bit
 * word; the bit position of each colour channel is selected by
 * UTIL_ARCH_BIG_ENDIAN. The i and j arguments are not used. Alpha is
 * always written as 1.
 *
 * dst: receives 4 floats ordered r, g, b, a.
 */
static inline void
util_format_r8g8b8x8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned shift_r = 24, shift_g = 16, shift_b = 8;
#else
   const unsigned shift_r = 0, shift_g = 8, shift_b = 16;
#endif
   const uint32_t texel = *(const uint32_t *)src;

   dst[0] = ubyte_to_float((texel >> shift_r) & 0xff); /* r */
   dst[1] = ubyte_to_float((texel >> shift_g) & 0xff); /* g */
   dst[2] = ubyte_to_float((texel >> shift_b) & 0xff); /* b */
   dst[3] = 1.0f; /* a */
}

/**
 * Convert a rectangle of R8G8B8X8_UNORM texels into 8-bit RGBA.
 *
 * Every source texel is loaded as one native 32-bit word; the bit position
 * of each colour channel is selected by UTIL_ARCH_BIG_ENDIAN. The X bits
 * are not read and alpha is always written as 255.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (4 bytes per texel, ordered r, g, b, a).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (4 bytes per texel).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_r8g8b8x8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned shift_r = 24, shift_g = 16, shift_b = 8;
#else
   const unsigned shift_r = 0, shift_g = 8, shift_b = 16;
#endif
   unsigned row;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t texel = *(const uint32_t *)in;

         out[0] = (uint8_t)((texel >> shift_r) & 0xff); /* r */
         out[1] = (uint8_t)((texel >> shift_g) & 0xff); /* g */
         out[2] = (uint8_t)((texel >> shift_b) & 0xff); /* b */
         out[3] = 255; /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Convert a rectangle of 8-bit RGBA texels into R8G8B8X8_UNORM.
 *
 * The r, g and b bytes are combined into one 32-bit word that is stored
 * with a native 32-bit write; the bit position of each channel is selected
 * by UTIL_ARCH_BIG_ENDIAN. The source alpha is not read and the X bits are
 * written as zero.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (4 bytes per texel).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (4 bytes per texel, ordered r, g, b, a).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_r8g8b8x8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned shift_r = 24, shift_g = 16, shift_b = 8;
#else
   const unsigned shift_r = 0, shift_g = 8, shift_b = 16;
#endif
   unsigned row;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t texel = (((uint32_t)in[0] & 0xff) << shift_r) |
                                (((uint32_t)in[1] & 0xff) << shift_g) |
                                (((uint32_t)in[2] & 0xff) << shift_b);

         *(uint32_t *)out = texel;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Convert a rectangle of R5G5B5A1_UNORM texels into float RGBA.
 *
 * Every source texel is loaded as one native 16-bit word. Red is bits 0-4,
 * green bits 5-9, blue bits 10-14 and alpha bit 15; these positions are
 * the same regardless of UTIL_ARCH_BIG_ENDIAN. The 5-bit channels are
 * scaled by 1/31 and the 1-bit alpha by 1/1.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (4 floats per texel, ordered r, g, b, a).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (2 bytes per texel).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_r5g5b5a1_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 2, out += 4) {
         const uint16_t texel = *(const uint16_t *)in;
         const uint16_t r = texel & 0x1f;
         const uint16_t g = (texel >> 5) & 0x1f;
         const uint16_t b = (texel >> 10) & 0x1f;
         const uint16_t a = texel >> 15;

         out[0] = (float)(r * (1.0f/0x1f)); /* r */
         out[1] = (float)(g * (1.0f/0x1f)); /* g */
         out[2] = (float)(b * (1.0f/0x1f)); /* b */
         out[3] = (float)(a * (1.0f/0x1)); /* a */
      }

      src_row += src_stride;
      /* dst_stride is in bytes, dst_row counts floats. */
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Convert a rectangle of float RGBA texels into R5G5B5A1_UNORM.
 *
 * Each input channel is clamped to [0, 1], scaled to its bit width (31 for
 * colour, 1 for alpha) and passed through util_iround(). Red lands in bits
 * 0-4, green in bits 5-9, blue in bits 10-14 and alpha in bit 15 of a
 * 16-bit word stored with a native 16-bit write; these positions are the
 * same regardless of UTIL_ARCH_BIG_ENDIAN.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (2 bytes per texel).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (4 floats per texel, ordered r, g, b, a).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_r5g5b5a1_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 4, out += 2) {
         const uint32_t r = ((uint16_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         const uint32_t g = ((uint16_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         const uint32_t b = ((uint16_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         /* Alpha is not masked; only its lowest bit survives the 16-bit store. */
         const uint32_t a = (uint16_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0x1);

         *(uint16_t *)out = (uint16_t)(r | (g << 5) | (b << 10) | (a << 15));
      }

      dst_row += dst_stride;
      /* src_stride is in bytes, src_row counts floats. */
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single R5G5B5A1_UNORM texel as float RGBA.
 *
 * src points directly at the texel, which is loaded as one native 16-bit
 * word. Red is bits 0-4, green bits 5-9, blue bits 10-14 and alpha bit 15;
 * these positions are the same regardless of UTIL_ARCH_BIG_ENDIAN. The i
 * and j arguments are not used.
 *
 * dst: receives 4 floats ordered r, g, b, a.
 */
static inline void
util_format_r5g5b5a1_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint16_t texel = *(const uint16_t *)src;
   const uint16_t r = texel & 0x1f;
   const uint16_t g = (texel >> 5) & 0x1f;
   const uint16_t b = (texel >> 10) & 0x1f;
   const uint16_t a = texel >> 15;

   dst[0] = (float)(r * (1.0f/0x1f)); /* r */
   dst[1] = (float)(g * (1.0f/0x1f)); /* g */
   dst[2] = (float)(b * (1.0f/0x1f)); /* b */
   dst[3] = (float)(a * (1.0f/0x1)); /* a */
}

/**
 * Convert a rectangle of R5G5B5A1_UNORM texels into 8-bit RGBA.
 *
 * Every source texel is loaded as one native 16-bit word. Red is bits 0-4,
 * green bits 5-9, blue bits 10-14 and alpha bit 15; these positions are
 * the same regardless of UTIL_ARCH_BIG_ENDIAN. Channels are rescaled with
 * truncating integer arithmetic: c * 255 / 31 for colour, a * 255 for
 * alpha.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (4 bytes per texel, ordered r, g, b, a).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (2 bytes per texel).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_r5g5b5a1_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 2, out += 4) {
         const uint16_t texel = *(const uint16_t *)in;
         const uint32_t r = texel & 0x1f;
         const uint32_t g = (texel >> 5) & 0x1f;
         const uint32_t b = (texel >> 10) & 0x1f;
         const uint32_t a = texel >> 15;

         out[0] = (uint8_t)(r * 0xff / 0x1f); /* r */
         out[1] = (uint8_t)(g * 0xff / 0x1f); /* g */
         out[2] = (uint8_t)(b * 0xff / 0x1f); /* b */
         out[3] = (uint8_t)(a * 0xff / 0x1); /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Convert a rectangle of 8-bit RGBA texels into R5G5B5A1_UNORM.
 *
 * Colour bytes keep their top 5 bits (>> 3) and alpha keeps its top bit
 * (>> 7). Red lands in bits 0-4, green in bits 5-9, blue in bits 10-14 and
 * alpha in bit 15 of a 16-bit word stored with a native 16-bit write;
 * these positions are the same regardless of UTIL_ARCH_BIG_ENDIAN.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (2 bytes per texel).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (4 bytes per texel, ordered r, g, b, a).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_r5g5b5a1_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 4, out += 2) {
         const uint32_t r = ((uint16_t)(in[0] >> 3)) & 0x1f;
         const uint32_t g = ((uint16_t)(in[1] >> 3)) & 0x1f;
         const uint32_t b = ((uint16_t)(in[2] >> 3)) & 0x1f;
         const uint32_t a = (uint16_t)(in[3] >> 7);

         *(uint16_t *)out = (uint16_t)(r | (g << 5) | (b << 10) | (a << 15));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Convert a rectangle of B5G5R5X1_UNORM texels into float RGBA.
 *
 * Every source texel is loaded as one native 16-bit word. Blue is bits
 * 0-4, green bits 5-9 and red bits 10-14; these positions are the same
 * regardless of UTIL_ARCH_BIG_ENDIAN. Bit 15 is not read and alpha is
 * always written as 1. The 5-bit channels are scaled by 1/31.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (4 floats per texel, ordered r, g, b, a).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (2 bytes per texel).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_b5g5r5x1_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 2, out += 4) {
         const uint16_t texel = *(const uint16_t *)in;
         const uint16_t b = texel & 0x1f;
         const uint16_t g = (texel >> 5) & 0x1f;
         const uint16_t r = (texel >> 10) & 0x1f;

         out[0] = (float)(r * (1.0f/0x1f)); /* r */
         out[1] = (float)(g * (1.0f/0x1f)); /* g */
         out[2] = (float)(b * (1.0f/0x1f)); /* b */
         out[3] = 1.0f; /* a */
      }

      src_row += src_stride;
      /* dst_stride is in bytes, dst_row counts floats. */
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Convert a rectangle of float RGBA texels into B5G5R5X1_UNORM.
 *
 * The r, g and b inputs are clamped to [0, 1], scaled by 31 and passed
 * through util_iround(). Blue lands in bits 0-4, green in bits 5-9 and red
 * in bits 10-14 of a 16-bit word stored with a native 16-bit write; these
 * positions are the same regardless of UTIL_ARCH_BIG_ENDIAN. The source
 * alpha is not read and bit 15 is written as zero.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (2 bytes per texel).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (4 floats per texel, ordered r, g, b, a).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_b5g5r5x1_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 4, out += 2) {
         const uint32_t r = ((uint16_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         const uint32_t g = ((uint16_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         const uint32_t b = ((uint16_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x1f)) & 0x1f;

         *(uint16_t *)out = (uint16_t)(b | (g << 5) | (r << 10));
      }

      dst_row += dst_stride;
      /* src_stride is in bytes, src_row counts floats. */
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single B5G5R5X1_UNORM texel as float RGBA.
 *
 * src points directly at the texel, which is loaded as one native 16-bit
 * word. Blue is bits 0-4, green bits 5-9 and red bits 10-14; these
 * positions are the same regardless of UTIL_ARCH_BIG_ENDIAN. Bit 15 is
 * not read and alpha is always written as 1. The i and j arguments are
 * not used.
 *
 * dst: receives 4 floats ordered r, g, b, a.
 */
static inline void
util_format_b5g5r5x1_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint16_t texel = *(const uint16_t *)src;
   const uint16_t b = texel & 0x1f;
   const uint16_t g = (texel >> 5) & 0x1f;
   const uint16_t r = (texel >> 10) & 0x1f;

   dst[0] = (float)(r * (1.0f/0x1f)); /* r */
   dst[1] = (float)(g * (1.0f/0x1f)); /* g */
   dst[2] = (float)(b * (1.0f/0x1f)); /* b */
   dst[3] = 1.0f; /* a */
}

/**
 * Convert a rectangle of B5G5R5X1_UNORM texels into 8-bit RGBA.
 *
 * Every source texel is loaded as one native 16-bit word. Blue is bits
 * 0-4, green bits 5-9 and red bits 10-14; these positions are the same
 * regardless of UTIL_ARCH_BIG_ENDIAN. Bit 15 is not read and alpha is
 * always written as 255. Colour channels are rescaled with truncating
 * integer arithmetic: c * 255 / 31.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (4 bytes per texel, ordered r, g, b, a).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (2 bytes per texel).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_b5g5r5x1_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 2, out += 4) {
         const uint16_t texel = *(const uint16_t *)in;
         const uint32_t b = texel & 0x1f;
         const uint32_t g = (texel >> 5) & 0x1f;
         const uint32_t r = (texel >> 10) & 0x1f;

         out[0] = (uint8_t)(r * 0xff / 0x1f); /* r */
         out[1] = (uint8_t)(g * 0xff / 0x1f); /* g */
         out[2] = (uint8_t)(b * 0xff / 0x1f); /* b */
         out[3] = 255; /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Convert a rectangle of 8-bit RGBA texels into B5G5R5X1_UNORM.
 *
 * Colour bytes keep their top 5 bits (>> 3). Blue lands in bits 0-4, green
 * in bits 5-9 and red in bits 10-14 of a 16-bit word stored with a native
 * 16-bit write; these positions are the same regardless of
 * UTIL_ARCH_BIG_ENDIAN. The source alpha is not read and bit 15 is written
 * as zero.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (2 bytes per texel).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (4 bytes per texel, ordered r, g, b, a).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_b5g5r5x1_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 4, out += 2) {
         const uint32_t r = ((uint16_t)(in[0] >> 3)) & 0x1f;
         const uint32_t g = ((uint16_t)(in[1] >> 3)) & 0x1f;
         const uint32_t b = ((uint16_t)(in[2] >> 3)) & 0x1f;

         *(uint16_t *)out = (uint16_t)(b | (g << 5) | (r << 10));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Convert a rectangle of B5G5R5A1_UNORM texels into float RGBA.
 *
 * Every source texel is loaded as one native 16-bit word. Blue is bits
 * 0-4, green bits 5-9, red bits 10-14 and alpha bit 15; these positions
 * are the same regardless of UTIL_ARCH_BIG_ENDIAN. The 5-bit channels are
 * scaled by 1/31 and the 1-bit alpha by 1/1.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (4 floats per texel, ordered r, g, b, a).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (2 bytes per texel).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_b5g5r5a1_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 2, out += 4) {
         const uint16_t texel = *(const uint16_t *)in;
         const uint16_t b = texel & 0x1f;
         const uint16_t g = (texel >> 5) & 0x1f;
         const uint16_t r = (texel >> 10) & 0x1f;
         const uint16_t a = texel >> 15;

         out[0] = (float)(r * (1.0f/0x1f)); /* r */
         out[1] = (float)(g * (1.0f/0x1f)); /* g */
         out[2] = (float)(b * (1.0f/0x1f)); /* b */
         out[3] = (float)(a * (1.0f/0x1)); /* a */
      }

      src_row += src_stride;
      /* dst_stride is in bytes, dst_row counts floats. */
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Convert a rectangle of float RGBA texels into B5G5R5A1_UNORM.
 *
 * Each input channel is clamped to [0, 1], scaled to its bit width (31 for
 * colour, 1 for alpha) and passed through util_iround(). Blue lands in
 * bits 0-4, green in bits 5-9, red in bits 10-14 and alpha in bit 15 of a
 * 16-bit word stored with a native 16-bit write; these positions are the
 * same regardless of UTIL_ARCH_BIG_ENDIAN.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (2 bytes per texel).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (4 floats per texel, ordered r, g, b, a).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_b5g5r5a1_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 4, out += 2) {
         const uint32_t r = ((uint16_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         const uint32_t g = ((uint16_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         const uint32_t b = ((uint16_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         /* Alpha is not masked; only its lowest bit survives the 16-bit store. */
         const uint32_t a = (uint16_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0x1);

         *(uint16_t *)out = (uint16_t)(b | (g << 5) | (r << 10) | (a << 15));
      }

      dst_row += dst_stride;
      /* src_stride is in bytes, src_row counts floats. */
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single B5G5R5A1_UNORM texel as float RGBA.
 *
 * src points directly at the texel, which is loaded as one native 16-bit
 * word. Blue is bits 0-4, green bits 5-9, red bits 10-14 and alpha bit 15;
 * these positions are the same regardless of UTIL_ARCH_BIG_ENDIAN. The i
 * and j arguments are not used.
 *
 * dst: receives 4 floats ordered r, g, b, a.
 */
static inline void
util_format_b5g5r5a1_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint16_t texel = *(const uint16_t *)src;
   const uint16_t b = texel & 0x1f;
   const uint16_t g = (texel >> 5) & 0x1f;
   const uint16_t r = (texel >> 10) & 0x1f;
   const uint16_t a = texel >> 15;

   dst[0] = (float)(r * (1.0f/0x1f)); /* r */
   dst[1] = (float)(g * (1.0f/0x1f)); /* g */
   dst[2] = (float)(b * (1.0f/0x1f)); /* b */
   dst[3] = (float)(a * (1.0f/0x1)); /* a */
}

/**
 * Convert a rectangle of B5G5R5A1_UNORM texels into 8-bit RGBA.
 *
 * Every source texel is loaded as one native 16-bit word. Blue is bits
 * 0-4, green bits 5-9, red bits 10-14 and alpha bit 15; these positions
 * are the same regardless of UTIL_ARCH_BIG_ENDIAN. Channels are rescaled
 * with truncating integer arithmetic: c * 255 / 31 for colour, a * 255 for
 * alpha.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (4 bytes per texel, ordered r, g, b, a).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (2 bytes per texel).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_b5g5r5a1_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 2, out += 4) {
         const uint16_t texel = *(const uint16_t *)in;
         const uint32_t b = texel & 0x1f;
         const uint32_t g = (texel >> 5) & 0x1f;
         const uint32_t r = (texel >> 10) & 0x1f;
         const uint32_t a = texel >> 15;

         out[0] = (uint8_t)(r * 0xff / 0x1f); /* r */
         out[1] = (uint8_t)(g * 0xff / 0x1f); /* g */
         out[2] = (uint8_t)(b * 0xff / 0x1f); /* b */
         out[3] = (uint8_t)(a * 0xff / 0x1); /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Convert a rectangle of 8-bit RGBA texels into B5G5R5A1_UNORM.
 *
 * Colour bytes keep their top 5 bits (>> 3) and alpha keeps its top bit
 * (>> 7). Blue lands in bits 0-4, green in bits 5-9, red in bits 10-14 and
 * alpha in bit 15 of a 16-bit word stored with a native 16-bit write;
 * these positions are the same regardless of UTIL_ARCH_BIG_ENDIAN.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (2 bytes per texel).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (4 bytes per texel, ordered r, g, b, a).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_b5g5r5a1_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 4, out += 2) {
         const uint32_t r = ((uint16_t)(in[0] >> 3)) & 0x1f;
         const uint32_t g = ((uint16_t)(in[1] >> 3)) & 0x1f;
         const uint32_t b = ((uint16_t)(in[2] >> 3)) & 0x1f;
         const uint32_t a = (uint16_t)(in[3] >> 7);

         *(uint16_t *)out = (uint16_t)(b | (g << 5) | (r << 10) | (a << 15));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Convert a rectangle of X1B5G5R5_UNORM texels into float RGBA.
 *
 * Every source texel is loaded as one native 16-bit word. Blue is bits
 * 1-5, green bits 6-10 and red bits 11-15; these positions are the same
 * regardless of UTIL_ARCH_BIG_ENDIAN. Bit 0 is not read and alpha is
 * always written as 1. The 5-bit channels are scaled by 1/31.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (4 floats per texel, ordered r, g, b, a).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (2 bytes per texel).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_x1b5g5r5_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 2, out += 4) {
         const uint16_t texel = *(const uint16_t *)in;
         const uint16_t b = (texel >> 1) & 0x1f;
         const uint16_t g = (texel >> 6) & 0x1f;
         const uint16_t r = texel >> 11;

         out[0] = (float)(r * (1.0f/0x1f)); /* r */
         out[1] = (float)(g * (1.0f/0x1f)); /* g */
         out[2] = (float)(b * (1.0f/0x1f)); /* b */
         out[3] = 1.0f; /* a */
      }

      src_row += src_stride;
      /* dst_stride is in bytes, dst_row counts floats. */
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Convert a rectangle of float RGBA texels into X1B5G5R5_UNORM.
 *
 * The r, g and b inputs are clamped to [0, 1], scaled by 31 and passed
 * through util_iround(). Blue lands in bits 1-5, green in bits 6-10 and
 * red in bits 11-15 of a 16-bit word stored with a native 16-bit write;
 * these positions are the same regardless of UTIL_ARCH_BIG_ENDIAN. The
 * source alpha is not read and bit 0 is written as zero.
 *
 * dst_row/dst_stride: first output row and the byte distance between rows
 *                     (2 bytes per texel).
 * src_row/src_stride: first input row and the byte distance between rows
 *                     (4 floats per texel, ordered r, g, b, a).
 * width/height:       texels per row and number of rows.
 */
static inline void
util_format_x1b5g5r5_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;

      for (col = 0; col < width; ++col, in += 4, out += 2) {
         /* Red is not masked; bits above its 5-bit field fall off the 16-bit store. */
         const uint32_t r = (uint16_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x1f);
         const uint32_t g = ((uint16_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         const uint32_t b = ((uint16_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x1f)) & 0x1f;

         *(uint16_t *)out = (uint16_t)((b << 1) | (g << 6) | (r << 11));
      }

      dst_row += dst_stride;
      /* src_stride is in bytes, src_row counts floats. */
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Decode the single X1B5G5R5_UNORM pixel at src into four floats at dst.
 *
 * The 16-bit word is loaded in host byte order; red is taken from bits
 * 11-15, green from bits 6-10 and blue from bits 1-5, each scaled by 1/31.
 * Alpha is always 1.  The i and j arguments are not used.
 */
static inline void
util_format_x1b5g5r5_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Same field positions on big- and little-endian hosts. */
   const uint16_t packed = *(const uint16_t *)src;
   const uint16_t red = packed >> 11;
   const uint16_t green = (packed >> 6) & 0x1f;
   const uint16_t blue = (packed >> 1) & 0x1f;

   dst[0] = (float)(red * (1.0f/0x1f));
   dst[1] = (float)(green * (1.0f/0x1f));
   dst[2] = (float)(blue * (1.0f/0x1f));
   dst[3] = 1;
}

/**
 * Convert a width x height rectangle of X1B5G5R5_UNORM pixels to 8-bit RGBA.
 *
 * Every source pixel is one 16-bit word loaded in host byte order: blue in
 * bits 1-5, green in bits 6-10, red in bits 11-15.  Each 5-bit channel c is
 * expanded as c * 255 / 31 (integer division); alpha is always 255.
 * src_stride and dst_stride are both byte counts.
 */
static inline void
util_format_x1b5g5r5_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      uint8_t *rgba = dst_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         /* Field positions are identical on big- and little-endian hosts. */
         const uint32_t packed = ((const uint16_t *)src_row)[col];
         rgba[0] = (uint8_t)((packed >> 11) * 0xff / 0x1f);         /* r */
         rgba[1] = (uint8_t)(((packed >> 6) & 0x1f) * 0xff / 0x1f); /* g */
         rgba[2] = (uint8_t)(((packed >> 1) & 0x1f) * 0xff / 0x1f); /* b */
         rgba[3] = 255;                                             /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Convert a width x height rectangle of 8-bit RGBA to X1B5G5R5_UNORM pixels.
 *
 * Each colour byte keeps its top five bits (value >> 3).  Red is placed in
 * bits 11-15, green in bits 6-10 and blue in bits 1-5; the padding bit 0 is
 * written as 0 and the source alpha byte is not read.
 * src_stride and dst_stride are both byte counts.
 */
static inline void
util_format_x1b5g5r5_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         /* An 8-bit value shifted right by 3 already fits in 5 bits. */
         const uint32_t r5 = rgba[0] >> 3;
         const uint32_t g5 = rgba[1] >> 3;
         const uint32_t b5 = rgba[2] >> 3;
         /* Stored in host byte order; same layout for either endianness. */
         ((uint16_t *)dst_row)[col] = (uint16_t)((r5 << 11) | (g5 << 6) | (b5 << 1));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Convert a width x height rectangle of A1R5G5B5_UNORM pixels to RGBA floats.
 *
 * Every source pixel is one 16-bit word loaded in host byte order: alpha is
 * bit 0, red occupies bits 1-5, green bits 6-10 and blue bits 11-15.
 * Colour channels are scaled by 1/31; alpha becomes 0 or 1.
 * src_stride and dst_stride are both byte counts.
 */
static inline void
util_format_a1r5g5b5_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      float *rgba = dst_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         /* Field positions are identical on big- and little-endian hosts. */
         const uint16_t packed = ((const uint16_t *)src_row)[col];
         rgba[0] = (float)(((packed >> 1) & 0x1f) * (1.0f/0x1f)); /* r */
         rgba[1] = (float)(((packed >> 6) & 0x1f) * (1.0f/0x1f)); /* g */
         rgba[2] = (float)((packed >> 11) * (1.0f/0x1f));         /* b */
         rgba[3] = (float)((packed & 0x1) * (1.0f/0x1));          /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Convert a width x height rectangle of RGBA floats to A1R5G5B5_UNORM pixels.
 *
 * Each channel goes through CLAMP(v, 0.0f, 1.0f), is multiplied by its
 * maximum (31 for colour, 1 for alpha) and converted with util_iround().
 * Alpha lands in bit 0, red in bits 1-5, green in bits 6-10 and blue in
 * bits 11-15.  Both strides are byte counts.
 */
static inline void
util_format_a1r5g5b5_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const float *rgba = src_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         const uint16_t r = (uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0x1f);
         const uint16_t g = (uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0x1f);
         const uint16_t b = (uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0x1f);
         const uint16_t a = (uint16_t)util_iround(CLAMP(rgba[3], 0.0f, 1.0f) * 0x1);
         /* The word is stored in host byte order; the layout is the same
          * for big- and little-endian builds. */
         ((uint16_t *)dst_row)[col] = (uint16_t)(((uint32_t)b << 11) |
                                                 ((uint32_t)(g & 0x1f) << 6) |
                                                 ((uint32_t)(r & 0x1f) << 1) |
                                                 (a & 0x1));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Decode the single A1R5G5B5_UNORM pixel at src into four floats at dst.
 *
 * The 16-bit word is loaded in host byte order; alpha is bit 0, red is
 * taken from bits 1-5, green from bits 6-10 and blue from bits 11-15.
 * Colour channels are scaled by 1/31.  The i and j arguments are not used.
 */
static inline void
util_format_a1r5g5b5_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Same field positions on big- and little-endian hosts. */
   const uint16_t packed = *(const uint16_t *)src;
   const uint16_t alpha = packed & 0x1;
   const uint16_t red = (packed >> 1) & 0x1f;
   const uint16_t green = (packed >> 6) & 0x1f;
   const uint16_t blue = packed >> 11;

   dst[0] = (float)(red * (1.0f/0x1f));
   dst[1] = (float)(green * (1.0f/0x1f));
   dst[2] = (float)(blue * (1.0f/0x1f));
   dst[3] = (float)(alpha * (1.0f/0x1));
}

/**
 * Convert a width x height rectangle of A1R5G5B5_UNORM pixels to 8-bit RGBA.
 *
 * Every source pixel is one 16-bit word loaded in host byte order: alpha is
 * bit 0, red bits 1-5, green bits 6-10, blue bits 11-15.  Each 5-bit channel
 * c is expanded as c * 255 / 31 (integer division); alpha becomes 0 or 255.
 * src_stride and dst_stride are both byte counts.
 */
static inline void
util_format_a1r5g5b5_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      uint8_t *rgba = dst_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         /* Field positions are identical on big- and little-endian hosts. */
         const uint32_t packed = ((const uint16_t *)src_row)[col];
         rgba[0] = (uint8_t)(((packed >> 1) & 0x1f) * 0xff / 0x1f); /* r */
         rgba[1] = (uint8_t)(((packed >> 6) & 0x1f) * 0xff / 0x1f); /* g */
         rgba[2] = (uint8_t)((packed >> 11) * 0xff / 0x1f);         /* b */
         rgba[3] = (uint8_t)((packed & 0x1) * 0xff / 0x1);          /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Convert a width x height rectangle of 8-bit RGBA to A1R5G5B5_UNORM pixels.
 *
 * Colour bytes keep their top five bits (value >> 3) and alpha keeps its top
 * bit (value >> 7).  Alpha is placed in bit 0, red in bits 1-5, green in
 * bits 6-10 and blue in bits 11-15.
 * src_stride and dst_stride are both byte counts.
 */
static inline void
util_format_a1r5g5b5_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         /* 8-bit inputs shifted this far already fit their target fields. */
         const uint32_t r5 = rgba[0] >> 3;
         const uint32_t g5 = rgba[1] >> 3;
         const uint32_t b5 = rgba[2] >> 3;
         const uint32_t a1 = rgba[3] >> 7;
         /* Stored in host byte order; same layout for either endianness. */
         ((uint16_t *)dst_row)[col] = (uint16_t)((b5 << 11) | (g5 << 6) | (r5 << 1) | a1);
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Convert a width x height rectangle of A1B5G5R5_UNORM pixels to RGBA floats.
 *
 * Every source pixel is one 16-bit word loaded in host byte order: alpha is
 * bit 0, blue occupies bits 1-5, green bits 6-10 and red bits 11-15.
 * Colour channels are scaled by 1/31; alpha becomes 0 or 1.
 * src_stride and dst_stride are both byte counts.
 */
static inline void
util_format_a1b5g5r5_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      float *rgba = dst_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         /* Field positions are identical on big- and little-endian hosts. */
         const uint16_t packed = ((const uint16_t *)src_row)[col];
         rgba[0] = (float)((packed >> 11) * (1.0f/0x1f));         /* r */
         rgba[1] = (float)(((packed >> 6) & 0x1f) * (1.0f/0x1f)); /* g */
         rgba[2] = (float)(((packed >> 1) & 0x1f) * (1.0f/0x1f)); /* b */
         rgba[3] = (float)((packed & 0x1) * (1.0f/0x1));          /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Convert a width x height rectangle of RGBA floats to A1B5G5R5_UNORM pixels.
 *
 * Each channel goes through CLAMP(v, 0.0f, 1.0f), is multiplied by its
 * maximum (31 for colour, 1 for alpha) and converted with util_iround().
 * Alpha lands in bit 0, blue in bits 1-5, green in bits 6-10 and red in
 * bits 11-15.  Both strides are byte counts.
 */
static inline void
util_format_a1b5g5r5_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const float *rgba = src_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         const uint16_t r = (uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0x1f);
         const uint16_t g = (uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0x1f);
         const uint16_t b = (uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0x1f);
         const uint16_t a = (uint16_t)util_iround(CLAMP(rgba[3], 0.0f, 1.0f) * 0x1);
         /* The word is stored in host byte order; the layout is the same
          * for big- and little-endian builds. */
         ((uint16_t *)dst_row)[col] = (uint16_t)(((uint32_t)r << 11) |
                                                 ((uint32_t)(g & 0x1f) << 6) |
                                                 ((uint32_t)(b & 0x1f) << 1) |
                                                 (a & 0x1));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Decode the single A1B5G5R5_UNORM pixel at src into four floats at dst.
 *
 * The 16-bit word is loaded in host byte order; alpha is bit 0, blue is
 * taken from bits 1-5, green from bits 6-10 and red from bits 11-15.
 * Colour channels are scaled by 1/31.  The i and j arguments are not used.
 */
static inline void
util_format_a1b5g5r5_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Same field positions on big- and little-endian hosts. */
   const uint16_t packed = *(const uint16_t *)src;
   const uint16_t alpha = packed & 0x1;
   const uint16_t blue = (packed >> 1) & 0x1f;
   const uint16_t green = (packed >> 6) & 0x1f;
   const uint16_t red = packed >> 11;

   dst[0] = (float)(red * (1.0f/0x1f));
   dst[1] = (float)(green * (1.0f/0x1f));
   dst[2] = (float)(blue * (1.0f/0x1f));
   dst[3] = (float)(alpha * (1.0f/0x1));
}

/**
 * Convert a width x height rectangle of A1B5G5R5_UNORM pixels to 8-bit RGBA.
 *
 * Every source pixel is one 16-bit word loaded in host byte order: alpha is
 * bit 0, blue bits 1-5, green bits 6-10, red bits 11-15.  Each 5-bit channel
 * c is expanded as c * 255 / 31 (integer division); alpha becomes 0 or 255.
 * src_stride and dst_stride are both byte counts.
 */
static inline void
util_format_a1b5g5r5_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      uint8_t *rgba = dst_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         /* Field positions are identical on big- and little-endian hosts. */
         const uint32_t packed = ((const uint16_t *)src_row)[col];
         rgba[0] = (uint8_t)((packed >> 11) * 0xff / 0x1f);         /* r */
         rgba[1] = (uint8_t)(((packed >> 6) & 0x1f) * 0xff / 0x1f); /* g */
         rgba[2] = (uint8_t)(((packed >> 1) & 0x1f) * 0xff / 0x1f); /* b */
         rgba[3] = (uint8_t)((packed & 0x1) * 0xff / 0x1);          /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Convert a width x height rectangle of 8-bit RGBA to A1B5G5R5_UNORM pixels.
 *
 * Colour bytes keep their top five bits (value >> 3) and alpha keeps its top
 * bit (value >> 7).  Alpha is placed in bit 0, blue in bits 1-5, green in
 * bits 6-10 and red in bits 11-15.
 * src_stride and dst_stride are both byte counts.
 */
static inline void
util_format_a1b5g5r5_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         /* 8-bit inputs shifted this far already fit their target fields. */
         const uint32_t r5 = rgba[0] >> 3;
         const uint32_t g5 = rgba[1] >> 3;
         const uint32_t b5 = rgba[2] >> 3;
         const uint32_t a1 = rgba[3] >> 7;
         /* Stored in host byte order; same layout for either endianness. */
         ((uint16_t *)dst_row)[col] = (uint16_t)((r5 << 11) | (g5 << 6) | (b5 << 1) | a1);
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Convert a width x height rectangle of R4G4B4A4_UNORM pixels to RGBA floats.
 *
 * Every source pixel is one 16-bit word loaded in host byte order: red is
 * in bits 0-3, green in bits 4-7, blue in bits 8-11 and alpha in bits 12-15.
 * All four channels are scaled by 1/15.
 * src_stride and dst_stride are both byte counts.
 */
static inline void
util_format_r4g4b4a4_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      float *rgba = dst_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         /* Field positions are identical on big- and little-endian hosts. */
         const uint16_t packed = ((const uint16_t *)src_row)[col];
         rgba[0] = (float)((packed & 0xf) * (1.0f/0xf));        /* r */
         rgba[1] = (float)(((packed >> 4) & 0xf) * (1.0f/0xf)); /* g */
         rgba[2] = (float)(((packed >> 8) & 0xf) * (1.0f/0xf)); /* b */
         rgba[3] = (float)((packed >> 12) * (1.0f/0xf));        /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Convert a width x height rectangle of RGBA floats to R4G4B4A4_UNORM pixels.
 *
 * Each channel goes through CLAMP(v, 0.0f, 1.0f), is multiplied by 15 and
 * converted with util_iround().  Red lands in bits 0-3, green in bits 4-7,
 * blue in bits 8-11 and alpha in bits 12-15.  Both strides are byte counts.
 */
static inline void
util_format_r4g4b4a4_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const float *rgba = src_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         const uint16_t r = (uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0xf);
         const uint16_t g = (uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0xf);
         const uint16_t b = (uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0xf);
         const uint16_t a = (uint16_t)util_iround(CLAMP(rgba[3], 0.0f, 1.0f) * 0xf);
         /* The word is stored in host byte order; the layout is the same
          * for big- and little-endian builds. */
         ((uint16_t *)dst_row)[col] = (uint16_t)(((uint32_t)a << 12) |
                                                 ((uint32_t)(b & 0xf) << 8) |
                                                 ((uint32_t)(g & 0xf) << 4) |
                                                 (r & 0xf));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Decode the single R4G4B4A4_UNORM pixel at src into four floats at dst.
 *
 * The 16-bit word is loaded in host byte order; red is taken from bits 0-3,
 * green from bits 4-7, blue from bits 8-11 and alpha from bits 12-15, each
 * scaled by 1/15.  The i and j arguments are not used.
 */
static inline void
util_format_r4g4b4a4_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Same field positions on big- and little-endian hosts. */
   const uint16_t packed = *(const uint16_t *)src;
   const uint16_t red = packed & 0xf;
   const uint16_t green = (packed >> 4) & 0xf;
   const uint16_t blue = (packed >> 8) & 0xf;
   const uint16_t alpha = packed >> 12;

   dst[0] = (float)(red * (1.0f/0xf));
   dst[1] = (float)(green * (1.0f/0xf));
   dst[2] = (float)(blue * (1.0f/0xf));
   dst[3] = (float)(alpha * (1.0f/0xf));
}

/**
 * Convert a width x height rectangle of R4G4B4A4_UNORM pixels to 8-bit RGBA.
 *
 * Every source pixel is one 16-bit word loaded in host byte order: red in
 * bits 0-3, green in bits 4-7, blue in bits 8-11, alpha in bits 12-15.
 * Each 4-bit channel c is expanded as c * 255 / 15.
 * src_stride and dst_stride are both byte counts.
 */
static inline void
util_format_r4g4b4a4_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      uint8_t *rgba = dst_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         /* Field positions are identical on big- and little-endian hosts. */
         const uint32_t packed = ((const uint16_t *)src_row)[col];
         rgba[0] = (uint8_t)((packed & 0xf) * 0xff / 0xf);        /* r */
         rgba[1] = (uint8_t)(((packed >> 4) & 0xf) * 0xff / 0xf); /* g */
         rgba[2] = (uint8_t)(((packed >> 8) & 0xf) * 0xff / 0xf); /* b */
         rgba[3] = (uint8_t)((packed >> 12) * 0xff / 0xf);        /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Convert a width x height rectangle of 8-bit RGBA to R4G4B4A4_UNORM pixels.
 *
 * Every byte keeps its top four bits (value >> 4).  Red is placed in bits
 * 0-3, green in bits 4-7, blue in bits 8-11 and alpha in bits 12-15.
 * src_stride and dst_stride are both byte counts.
 */
static inline void
util_format_r4g4b4a4_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         /* An 8-bit value shifted right by 4 already fits in 4 bits. */
         const uint32_t r4 = rgba[0] >> 4;
         const uint32_t g4 = rgba[1] >> 4;
         const uint32_t b4 = rgba[2] >> 4;
         const uint32_t a4 = rgba[3] >> 4;
         /* Stored in host byte order; same layout for either endianness. */
         ((uint16_t *)dst_row)[col] = (uint16_t)((a4 << 12) | (b4 << 8) | (g4 << 4) | r4);
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Convert a width x height rectangle of B4G4R4A4_UNORM pixels to RGBA floats.
 *
 * Every source pixel is one 16-bit word loaded in host byte order: blue is
 * in bits 0-3, green in bits 4-7, red in bits 8-11 and alpha in bits 12-15.
 * All four channels are scaled by 1/15.
 * src_stride and dst_stride are both byte counts.
 */
static inline void
util_format_b4g4r4a4_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      float *rgba = dst_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         /* Field positions are identical on big- and little-endian hosts. */
         const uint16_t packed = ((const uint16_t *)src_row)[col];
         rgba[0] = (float)(((packed >> 8) & 0xf) * (1.0f/0xf)); /* r */
         rgba[1] = (float)(((packed >> 4) & 0xf) * (1.0f/0xf)); /* g */
         rgba[2] = (float)((packed & 0xf) * (1.0f/0xf));        /* b */
         rgba[3] = (float)((packed >> 12) * (1.0f/0xf));        /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Convert a width x height rectangle of RGBA floats to B4G4R4A4_UNORM pixels.
 *
 * Each channel goes through CLAMP(v, 0.0f, 1.0f), is multiplied by 15 and
 * converted with util_iround().  Blue lands in bits 0-3, green in bits 4-7,
 * red in bits 8-11 and alpha in bits 12-15.  Both strides are byte counts.
 */
static inline void
util_format_b4g4r4a4_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const float *rgba = src_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         const uint16_t r = (uint16_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0xf);
         const uint16_t g = (uint16_t)util_iround(CLAMP(rgba[1], 0.0f, 1.0f) * 0xf);
         const uint16_t b = (uint16_t)util_iround(CLAMP(rgba[2], 0.0f, 1.0f) * 0xf);
         const uint16_t a = (uint16_t)util_iround(CLAMP(rgba[3], 0.0f, 1.0f) * 0xf);
         /* The word is stored in host byte order; the layout is the same
          * for big- and little-endian builds. */
         ((uint16_t *)dst_row)[col] = (uint16_t)(((uint32_t)a << 12) |
                                                 ((uint32_t)(r & 0xf) << 8) |
                                                 ((uint32_t)(g & 0xf) << 4) |
                                                 (b & 0xf));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Decode the single B4G4R4A4_UNORM pixel at src into four floats at dst.
 *
 * The 16-bit word is loaded in host byte order; blue is taken from bits 0-3,
 * green from bits 4-7, red from bits 8-11 and alpha from bits 12-15, each
 * scaled by 1/15.  The i and j arguments are not used.
 */
static inline void
util_format_b4g4r4a4_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Same field positions on big- and little-endian hosts. */
   const uint16_t packed = *(const uint16_t *)src;
   const uint16_t blue = packed & 0xf;
   const uint16_t green = (packed >> 4) & 0xf;
   const uint16_t red = (packed >> 8) & 0xf;
   const uint16_t alpha = packed >> 12;

   dst[0] = (float)(red * (1.0f/0xf));
   dst[1] = (float)(green * (1.0f/0xf));
   dst[2] = (float)(blue * (1.0f/0xf));
   dst[3] = (float)(alpha * (1.0f/0xf));
}

/**
 * Convert a width x height rectangle of B4G4R4A4_UNORM pixels to 8-bit RGBA.
 *
 * Every source pixel is one 16-bit word loaded in host byte order: blue in
 * bits 0-3, green in bits 4-7, red in bits 8-11, alpha in bits 12-15.
 * Each 4-bit channel c is expanded as c * 255 / 15.
 * src_stride and dst_stride are both byte counts.
 */
static inline void
util_format_b4g4r4a4_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      uint8_t *rgba = dst_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         /* Field positions are identical on big- and little-endian hosts. */
         const uint32_t packed = ((const uint16_t *)src_row)[col];
         rgba[0] = (uint8_t)(((packed >> 8) & 0xf) * 0xff / 0xf); /* r */
         rgba[1] = (uint8_t)(((packed >> 4) & 0xf) * 0xff / 0xf); /* g */
         rgba[2] = (uint8_t)((packed & 0xf) * 0xff / 0xf);        /* b */
         rgba[3] = (uint8_t)((packed >> 12) * 0xff / 0xf);        /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Convert a width x height rectangle of 8-bit RGBA to B4G4R4A4_UNORM pixels.
 *
 * Every byte keeps its top four bits (value >> 4).  Blue is placed in bits
 * 0-3, green in bits 4-7, red in bits 8-11 and alpha in bits 12-15.
 * src_stride and dst_stride are both byte counts.
 */
static inline void
util_format_b4g4r4a4_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         /* An 8-bit value shifted right by 4 already fits in 4 bits. */
         const uint32_t r4 = rgba[0] >> 4;
         const uint32_t g4 = rgba[1] >> 4;
         const uint32_t b4 = rgba[2] >> 4;
         const uint32_t a4 = rgba[3] >> 4;
         /* Stored in host byte order; same layout for either endianness. */
         ((uint16_t *)dst_row)[col] = (uint16_t)((a4 << 12) | (r4 << 8) | (g4 << 4) | b4);
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Convert a width x height rectangle of B4G4R4X4_UNORM pixels to RGBA floats.
 *
 * Every source pixel is one 16-bit word loaded in host byte order: blue is
 * in bits 0-3, green in bits 4-7 and red in bits 8-11; bits 12-15 are
 * ignored.  Colour channels are scaled by 1/15 and alpha is always 1.
 * src_stride and dst_stride are both byte counts.
 */
static inline void
util_format_b4g4r4x4_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      float *rgba = dst_row;
      for (col = 0; col < width; ++col, rgba += 4) {
         /* Field positions are identical on big- and little-endian hosts. */
         const uint16_t packed = ((const uint16_t *)src_row)[col];
         rgba[0] = (float)(((packed >> 8) & 0xf) * (1.0f/0xf)); /* r */
         rgba[1] = (float)(((packed >> 4) & 0xf) * (1.0f/0xf)); /* g */
         rgba[2] = (float)((packed & 0xf) * (1.0f/0xf));        /* b */
         rgba[3] = 1;                                           /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of RGBA floats into B4G4R4X4_UNORM pixels.
 *
 * Red, green and blue are clamped to [0, 1], scaled by 15 and rounded
 * with util_iround(); the source alpha is not read and bits 12-15 of the
 * stored word are zero.  The 16-bit word is written in host byte order.
 *
 * \param dst_row     first destination row (two bytes per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (four floats per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_b4g4r4x4_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      for (col = 0; col < width; ++col, in += 4, out += 2) {
         const uint16_t red = ((uint16_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint16_t green = ((uint16_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint16_t blue = ((uint16_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0xf)) & 0xf;
         *(uint16_t *)out = (uint16_t)(((uint32_t)red << 8) | ((uint32_t)green << 4) | blue);
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B4G4R4X4_UNORM pixel as RGBA floats.
 *
 * \param dst  receives four floats: r, g, b scaled by 1/15, and alpha = 1
 * \param src  points at the 16-bit pixel, read in host byte order
 * \param i    unused
 * \param j    unused
 */
static inline void
util_format_b4g4r4x4_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The word is loaded natively, so the shifts are endian-neutral. */
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t red = (pixel >> 8) & 0xf;
   const uint16_t green = (pixel >> 4) & 0xf;
   const uint16_t blue = pixel & 0xf;

   dst[0] = (float)(red * (1.0f/0xf));
   dst[1] = (float)(green * (1.0f/0xf));
   dst[2] = (float)(blue * (1.0f/0xf));
   dst[3] = 1;
}

/**
 * Unpack a rectangle of B4G4R4X4_UNORM pixels into 8-bit RGBA.
 *
 * A pixel is one 16-bit word read in host byte order: blue in bits 0-3,
 * green in bits 4-7, red in bits 8-11; bits 12-15 are ignored.  Each
 * 4-bit channel is expanded as v * 255 / 15 and alpha is written as 255.
 *
 * \param dst_row     first destination row (four bytes per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (two bytes per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_b4g4r4x4_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      for (col = 0; col < width; ++col, in += 2, out += 4) {
         /* The word is loaded natively, so the shifts are endian-neutral. */
         const uint16_t pixel = *(const uint16_t *)in;
         const uint16_t red = (pixel >> 8) & 0xf;
         const uint16_t green = (pixel >> 4) & 0xf;
         const uint16_t blue = pixel & 0xf;
         out[0] = (uint8_t)(((uint32_t)red) * 0xff / 0xf);
         out[1] = (uint8_t)(((uint32_t)green) * 0xff / 0xf);
         out[2] = (uint8_t)(((uint32_t)blue) * 0xff / 0xf);
         out[3] = 255;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into B4G4R4X4_UNORM.
 *
 * Red, green and blue are narrowed from 8 to 4 bits with round-to-nearest,
 * (v * 15 + 127) / 255, so the result agrees with the float packer; a
 * plain ">> 4" truncates (e.g. 0x0f would become 0 instead of 1).  The
 * source alpha is not read and bits 12-15 of the stored word are zero.
 * The 16-bit word is written in host byte order.
 *
 * \param dst_row     first destination row (two bytes per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (four bytes per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_b4g4r4x4_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         /* Round-to-nearest unorm8 -> unorm4; results are always <= 0xf. */
         const uint16_t r = (uint16_t)((src[0] * 0xf + 0x7f) / 0xff);
         const uint16_t g = (uint16_t)((src[1] * 0xf + 0x7f) / 0xff);
         const uint16_t b = (uint16_t)((src[2] * 0xf + 0x7f) / 0xff);
         /* Same layout on both byte orders: the word is stored natively. */
         *(uint16_t *)dst = (uint16_t)(((uint32_t)r << 8) | ((uint32_t)g << 4) | b);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of A4R4G4B4_UNORM pixels into RGBA floats.
 *
 * A pixel is one 16-bit word read in host byte order: alpha in bits 0-3,
 * red in bits 4-7, green in bits 8-11, blue in bits 12-15.  Each channel
 * is scaled by 1/15.
 *
 * \param dst_row     first destination row (four floats per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (two bytes per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_a4r4g4b4_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      for (col = 0; col < width; ++col, in += 2, out += 4) {
         /* The word is loaded natively, so the shifts are endian-neutral. */
         const uint16_t pixel = *(const uint16_t *)in;
         const uint16_t alpha = pixel & 0xf;
         const uint16_t red = (pixel >> 4) & 0xf;
         const uint16_t green = (pixel >> 8) & 0xf;
         const uint16_t blue = pixel >> 12;
         out[0] = (float)(red * (1.0f/0xf));
         out[1] = (float)(green * (1.0f/0xf));
         out[2] = (float)(blue * (1.0f/0xf));
         out[3] = (float)(alpha * (1.0f/0xf));
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of RGBA floats into A4R4G4B4_UNORM pixels.
 *
 * Every channel is clamped to [0, 1], scaled by 15 and rounded with
 * util_iround().  Alpha lands in bits 0-3, red in bits 4-7, green in
 * bits 8-11 and blue in bits 12-15 of a 16-bit word written in host
 * byte order.
 *
 * \param dst_row     first destination row (two bytes per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (four floats per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_a4r4g4b4_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      for (col = 0; col < width; ++col, in += 4, out += 2) {
         const uint16_t alpha = ((uint16_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint16_t red = ((uint16_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint16_t green = ((uint16_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0xf)) & 0xf;
         /* Topmost field is not masked; the 16-bit store bounds it. */
         const uint16_t blue = (uint16_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0xf);
         *(uint16_t *)out = (uint16_t)(alpha | ((uint32_t)red << 4) | ((uint32_t)green << 8) | ((uint32_t)blue << 12));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single A4R4G4B4_UNORM pixel as RGBA floats.
 *
 * \param dst  receives four floats (r, g, b, a), each scaled by 1/15
 * \param src  points at the 16-bit pixel, read in host byte order
 * \param i    unused
 * \param j    unused
 */
static inline void
util_format_a4r4g4b4_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The word is loaded natively, so the shifts are endian-neutral. */
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t alpha = pixel & 0xf;
   const uint16_t red = (pixel >> 4) & 0xf;
   const uint16_t green = (pixel >> 8) & 0xf;
   const uint16_t blue = pixel >> 12;

   dst[0] = (float)(red * (1.0f/0xf));
   dst[1] = (float)(green * (1.0f/0xf));
   dst[2] = (float)(blue * (1.0f/0xf));
   dst[3] = (float)(alpha * (1.0f/0xf));
}

/**
 * Unpack a rectangle of A4R4G4B4_UNORM pixels into 8-bit RGBA.
 *
 * A pixel is one 16-bit word read in host byte order: alpha in bits 0-3,
 * red in bits 4-7, green in bits 8-11, blue in bits 12-15.  Each 4-bit
 * channel is expanded as v * 255 / 15.
 *
 * \param dst_row     first destination row (four bytes per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (two bytes per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_a4r4g4b4_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      for (col = 0; col < width; ++col, in += 2, out += 4) {
         /* The word is loaded natively, so the shifts are endian-neutral. */
         const uint16_t pixel = *(const uint16_t *)in;
         const uint16_t alpha = pixel & 0xf;
         const uint16_t red = (pixel >> 4) & 0xf;
         const uint16_t green = (pixel >> 8) & 0xf;
         const uint16_t blue = pixel >> 12;
         out[0] = (uint8_t)(((uint32_t)red) * 0xff / 0xf);
         out[1] = (uint8_t)(((uint32_t)green) * 0xff / 0xf);
         out[2] = (uint8_t)(((uint32_t)blue) * 0xff / 0xf);
         out[3] = (uint8_t)(((uint32_t)alpha) * 0xff / 0xf);
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into A4R4G4B4_UNORM.
 *
 * Every channel is narrowed from 8 to 4 bits with round-to-nearest,
 * (v * 15 + 127) / 255, so the result agrees with the float packer; a
 * plain ">> 4" truncates (e.g. 0x0f would become 0 instead of 1).
 * Alpha lands in bits 0-3, red in bits 4-7, green in bits 8-11 and blue
 * in bits 12-15 of a 16-bit word written in host byte order.
 *
 * \param dst_row     first destination row (two bytes per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (four bytes per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_a4r4g4b4_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         /* Round-to-nearest unorm8 -> unorm4; results are always <= 0xf. */
         const uint16_t r = (uint16_t)((src[0] * 0xf + 0x7f) / 0xff);
         const uint16_t g = (uint16_t)((src[1] * 0xf + 0x7f) / 0xff);
         const uint16_t b = (uint16_t)((src[2] * 0xf + 0x7f) / 0xff);
         const uint16_t a = (uint16_t)((src[3] * 0xf + 0x7f) / 0xff);
         /* Same layout on both byte orders: the word is stored natively. */
         *(uint16_t *)dst = (uint16_t)(a | ((uint32_t)r << 4) | ((uint32_t)g << 8) | ((uint32_t)b << 12));
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of A4B4G4R4_UNORM pixels into RGBA floats.
 *
 * A pixel is one 16-bit word read in host byte order: alpha in bits 0-3,
 * blue in bits 4-7, green in bits 8-11, red in bits 12-15.  Each channel
 * is scaled by 1/15.
 *
 * \param dst_row     first destination row (four floats per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (two bytes per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_a4b4g4r4_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      for (col = 0; col < width; ++col, in += 2, out += 4) {
         /* The word is loaded natively, so the shifts are endian-neutral. */
         const uint16_t pixel = *(const uint16_t *)in;
         const uint16_t alpha = pixel & 0xf;
         const uint16_t blue = (pixel >> 4) & 0xf;
         const uint16_t green = (pixel >> 8) & 0xf;
         const uint16_t red = pixel >> 12;
         out[0] = (float)(red * (1.0f/0xf));
         out[1] = (float)(green * (1.0f/0xf));
         out[2] = (float)(blue * (1.0f/0xf));
         out[3] = (float)(alpha * (1.0f/0xf));
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of RGBA floats into A4B4G4R4_UNORM pixels.
 *
 * Every channel is clamped to [0, 1], scaled by 15 and rounded with
 * util_iround().  Alpha lands in bits 0-3, blue in bits 4-7, green in
 * bits 8-11 and red in bits 12-15 of a 16-bit word written in host
 * byte order.
 *
 * \param dst_row     first destination row (two bytes per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (four floats per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_a4b4g4r4_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      for (col = 0; col < width; ++col, in += 4, out += 2) {
         const uint16_t alpha = ((uint16_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint16_t blue = ((uint16_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint16_t green = ((uint16_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0xf)) & 0xf;
         /* Topmost field is not masked; the 16-bit store bounds it. */
         const uint16_t red = (uint16_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0xf);
         *(uint16_t *)out = (uint16_t)(alpha | ((uint32_t)blue << 4) | ((uint32_t)green << 8) | ((uint32_t)red << 12));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single A4B4G4R4_UNORM pixel as RGBA floats.
 *
 * \param dst  receives four floats (r, g, b, a), each scaled by 1/15
 * \param src  points at the 16-bit pixel, read in host byte order
 * \param i    unused
 * \param j    unused
 */
static inline void
util_format_a4b4g4r4_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The word is loaded natively, so the shifts are endian-neutral. */
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t alpha = pixel & 0xf;
   const uint16_t blue = (pixel >> 4) & 0xf;
   const uint16_t green = (pixel >> 8) & 0xf;
   const uint16_t red = pixel >> 12;

   dst[0] = (float)(red * (1.0f/0xf));
   dst[1] = (float)(green * (1.0f/0xf));
   dst[2] = (float)(blue * (1.0f/0xf));
   dst[3] = (float)(alpha * (1.0f/0xf));
}

/**
 * Unpack a rectangle of A4B4G4R4_UNORM pixels into 8-bit RGBA.
 *
 * A pixel is one 16-bit word read in host byte order: alpha in bits 0-3,
 * blue in bits 4-7, green in bits 8-11, red in bits 12-15.  Each 4-bit
 * channel is expanded as v * 255 / 15.
 *
 * \param dst_row     first destination row (four bytes per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (two bytes per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_a4b4g4r4_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      for (col = 0; col < width; ++col, in += 2, out += 4) {
         /* The word is loaded natively, so the shifts are endian-neutral. */
         const uint16_t pixel = *(const uint16_t *)in;
         const uint16_t alpha = pixel & 0xf;
         const uint16_t blue = (pixel >> 4) & 0xf;
         const uint16_t green = (pixel >> 8) & 0xf;
         const uint16_t red = pixel >> 12;
         out[0] = (uint8_t)(((uint32_t)red) * 0xff / 0xf);
         out[1] = (uint8_t)(((uint32_t)green) * 0xff / 0xf);
         out[2] = (uint8_t)(((uint32_t)blue) * 0xff / 0xf);
         out[3] = (uint8_t)(((uint32_t)alpha) * 0xff / 0xf);
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into A4B4G4R4_UNORM.
 *
 * Every channel is narrowed from 8 to 4 bits with round-to-nearest,
 * (v * 15 + 127) / 255, so the result agrees with the float packer; a
 * plain ">> 4" truncates (e.g. 0x0f would become 0 instead of 1).
 * Alpha lands in bits 0-3, blue in bits 4-7, green in bits 8-11 and red
 * in bits 12-15 of a 16-bit word written in host byte order.
 *
 * \param dst_row     first destination row (two bytes per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (four bytes per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_a4b4g4r4_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         /* Round-to-nearest unorm8 -> unorm4; results are always <= 0xf. */
         const uint16_t r = (uint16_t)((src[0] * 0xf + 0x7f) / 0xff);
         const uint16_t g = (uint16_t)((src[1] * 0xf + 0x7f) / 0xff);
         const uint16_t b = (uint16_t)((src[2] * 0xf + 0x7f) / 0xff);
         const uint16_t a = (uint16_t)((src[3] * 0xf + 0x7f) / 0xff);
         /* Same layout on both byte orders: the word is stored natively. */
         *(uint16_t *)dst = (uint16_t)(a | ((uint32_t)b << 4) | ((uint32_t)g << 8) | ((uint32_t)r << 12));
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of R5G6B5_UNORM pixels into RGBA floats.
 *
 * A pixel is one 16-bit word read in host byte order: red in bits 0-4,
 * green in bits 5-10, blue in bits 11-15.  Red and blue are scaled by
 * 1/31, green by 1/63, and alpha is written as 1.
 *
 * \param dst_row     first destination row (four floats per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (two bytes per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_r5g6b5_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      for (col = 0; col < width; ++col, in += 2, out += 4) {
         /* The word is loaded natively, so the shifts are endian-neutral. */
         const uint16_t pixel = *(const uint16_t *)in;
         const uint16_t red = pixel & 0x1f;
         const uint16_t green = (pixel >> 5) & 0x3f;
         const uint16_t blue = pixel >> 11;
         out[0] = (float)(red * (1.0f/0x1f));
         out[1] = (float)(green * (1.0f/0x3f));
         out[2] = (float)(blue * (1.0f/0x1f));
         out[3] = 1;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of RGBA floats into R5G6B5_UNORM pixels.
 *
 * Red, green and blue are clamped to [0, 1], scaled by 31, 63 and 31
 * respectively and rounded with util_iround(); the source alpha is not
 * read.  Red lands in bits 0-4, green in bits 5-10 and blue in bits
 * 11-15 of a 16-bit word written in host byte order.
 *
 * \param dst_row     first destination row (two bytes per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (four floats per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_r5g6b5_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      for (col = 0; col < width; ++col, in += 4, out += 2) {
         const uint16_t red = ((uint16_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         const uint16_t green = ((uint16_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x3f)) & 0x3f;
         /* Topmost field is not masked; the 16-bit store bounds it. */
         const uint16_t blue = (uint16_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x1f);
         *(uint16_t *)out = (uint16_t)(red | ((uint32_t)green << 5) | ((uint32_t)blue << 11));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R5G6B5_UNORM pixel as RGBA floats.
 *
 * \param dst  receives r and b scaled by 1/31, g scaled by 1/63, alpha = 1
 * \param src  points at the 16-bit pixel, read in host byte order
 * \param i    unused
 * \param j    unused
 */
static inline void
util_format_r5g6b5_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The word is loaded natively, so the shifts are endian-neutral. */
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t red = pixel & 0x1f;
   const uint16_t green = (pixel >> 5) & 0x3f;
   const uint16_t blue = pixel >> 11;

   dst[0] = (float)(red * (1.0f/0x1f));
   dst[1] = (float)(green * (1.0f/0x3f));
   dst[2] = (float)(blue * (1.0f/0x1f));
   dst[3] = 1;
}

/**
 * Unpack a rectangle of R5G6B5_UNORM pixels into 8-bit RGBA.
 *
 * A pixel is one 16-bit word read in host byte order: red in bits 0-4,
 * green in bits 5-10, blue in bits 11-15.  Red and blue are expanded as
 * v * 255 / 31, green as v * 255 / 63 (integer division), and alpha is
 * written as 255.
 *
 * \param dst_row     first destination row (four bytes per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (two bytes per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_r5g6b5_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      for (col = 0; col < width; ++col, in += 2, out += 4) {
         /* The word is loaded natively, so the shifts are endian-neutral. */
         const uint16_t pixel = *(const uint16_t *)in;
         const uint16_t red = pixel & 0x1f;
         const uint16_t green = (pixel >> 5) & 0x3f;
         const uint16_t blue = pixel >> 11;
         out[0] = (uint8_t)(((uint32_t)red) * 0xff / 0x1f);
         out[1] = (uint8_t)(((uint32_t)green) * 0xff / 0x3f);
         out[2] = (uint8_t)(((uint32_t)blue) * 0xff / 0x1f);
         out[3] = 255;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into R5G6B5_UNORM.
 *
 * Red and blue are narrowed from 8 to 5 bits and green from 8 to 6 bits
 * with round-to-nearest, (v * max + 127) / 255, so the result agrees
 * with the float packer; plain right shifts truncate (e.g. red 7 would
 * become 0 instead of 1, and 248 would become 31 instead of 30).  The
 * source alpha is not read.  Red lands in bits 0-4, green in bits 5-10
 * and blue in bits 11-15 of a 16-bit word written in host byte order.
 *
 * \param dst_row     first destination row (two bytes per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (four bytes per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_r5g6b5_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         /* Round-to-nearest narrowing; results never exceed the field max. */
         const uint16_t r = (uint16_t)((src[0] * 0x1f + 0x7f) / 0xff);
         const uint16_t g = (uint16_t)((src[1] * 0x3f + 0x7f) / 0xff);
         const uint16_t b = (uint16_t)((src[2] * 0x1f + 0x7f) / 0xff);
         /* Same layout on both byte orders: the word is stored natively. */
         *(uint16_t *)dst = (uint16_t)(r | ((uint32_t)g << 5) | ((uint32_t)b << 11));
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of B5G6R5_UNORM pixels into RGBA floats.
 *
 * A pixel is one 16-bit word read in host byte order: blue in bits 0-4,
 * green in bits 5-10, red in bits 11-15.  Red and blue are scaled by
 * 1/31, green by 1/63, and alpha is written as 1.
 *
 * \param dst_row     first destination row (four floats per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (two bytes per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_b5g6r5_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      for (col = 0; col < width; ++col, in += 2, out += 4) {
         /* The word is loaded natively, so the shifts are endian-neutral. */
         const uint16_t pixel = *(const uint16_t *)in;
         const uint16_t blue = pixel & 0x1f;
         const uint16_t green = (pixel >> 5) & 0x3f;
         const uint16_t red = pixel >> 11;
         out[0] = (float)(red * (1.0f/0x1f));
         out[1] = (float)(green * (1.0f/0x3f));
         out[2] = (float)(blue * (1.0f/0x1f));
         out[3] = 1;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of RGBA floats into B5G6R5_UNORM pixels.
 *
 * Red, green and blue are clamped to [0, 1], scaled by 31, 63 and 31
 * respectively and rounded with util_iround(); the source alpha is not
 * read.  Blue lands in bits 0-4, green in bits 5-10 and red in bits
 * 11-15 of a 16-bit word written in host byte order.
 *
 * \param dst_row     first destination row (two bytes per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (four floats per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_b5g6r5_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      for (col = 0; col < width; ++col, in += 4, out += 2) {
         const uint16_t blue = ((uint16_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x1f)) & 0x1f;
         const uint16_t green = ((uint16_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x3f)) & 0x3f;
         /* Topmost field is not masked; the 16-bit store bounds it. */
         const uint16_t red = (uint16_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x1f);
         *(uint16_t *)out = (uint16_t)(blue | ((uint32_t)green << 5) | ((uint32_t)red << 11));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B5G6R5_UNORM pixel as RGBA floats.
 *
 * \param dst  receives r and b scaled by 1/31, g scaled by 1/63, alpha = 1
 * \param src  points at the 16-bit pixel, read in host byte order
 * \param i    unused
 * \param j    unused
 */
static inline void
util_format_b5g6r5_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The word is loaded natively, so the shifts are endian-neutral. */
   const uint16_t pixel = *(const uint16_t *)src;
   const uint16_t blue = pixel & 0x1f;
   const uint16_t green = (pixel >> 5) & 0x3f;
   const uint16_t red = pixel >> 11;

   dst[0] = (float)(red * (1.0f/0x1f));
   dst[1] = (float)(green * (1.0f/0x3f));
   dst[2] = (float)(blue * (1.0f/0x1f));
   dst[3] = 1;
}

/**
 * Unpack a rectangle of B5G6R5_UNORM pixels into 8-bit RGBA.
 *
 * A pixel is one 16-bit word read in host byte order: blue in bits 0-4,
 * green in bits 5-10, red in bits 11-15.  Red and blue are expanded as
 * v * 255 / 31, green as v * 255 / 63 (integer division), and alpha is
 * written as 255.
 *
 * \param dst_row     first destination row (four bytes per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (two bytes per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_b5g6r5_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      for (col = 0; col < width; ++col, in += 2, out += 4) {
         /* The word is loaded natively, so the shifts are endian-neutral. */
         const uint16_t pixel = *(const uint16_t *)in;
         const uint16_t blue = pixel & 0x1f;
         const uint16_t green = (pixel >> 5) & 0x3f;
         const uint16_t red = pixel >> 11;
         out[0] = (uint8_t)(((uint32_t)red) * 0xff / 0x1f);
         out[1] = (uint8_t)(((uint32_t)green) * 0xff / 0x3f);
         out[2] = (uint8_t)(((uint32_t)blue) * 0xff / 0x1f);
         out[3] = 255;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into B5G6R5_UNORM.
 *
 * Red and blue are narrowed from 8 to 5 bits and green from 8 to 6 bits
 * with round-to-nearest, (v * max + 127) / 255, so the result agrees
 * with the float packer; plain right shifts truncate (e.g. blue 7 would
 * become 0 instead of 1, and 248 would become 31 instead of 30).  The
 * source alpha is not read.  Blue lands in bits 0-4, green in bits 5-10
 * and red in bits 11-15 of a 16-bit word written in host byte order.
 *
 * \param dst_row     first destination row (two bytes per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (four bytes per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_b5g6r5_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         /* Round-to-nearest narrowing; results never exceed the field max. */
         const uint16_t r = (uint16_t)((src[0] * 0x1f + 0x7f) / 0xff);
         const uint16_t g = (uint16_t)((src[1] * 0x3f + 0x7f) / 0xff);
         const uint16_t b = (uint16_t)((src[2] * 0x1f + 0x7f) / 0xff);
         /* Same layout on both byte orders: the word is stored natively. */
         *(uint16_t *)dst = (uint16_t)(b | ((uint32_t)g << 5) | ((uint32_t)r << 11));
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of R10G10B10A2_UNORM pixels into RGBA floats.
 *
 * A pixel is one 32-bit word read in host byte order: red in bits 0-9,
 * green in bits 10-19, blue in bits 20-29, alpha in bits 30-31.  The
 * colour channels are scaled by 1/1023 and alpha by 1/3.
 *
 * \param dst_row     first destination row (four floats per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (four bytes per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_r10g10b10a2_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      for (col = 0; col < width; ++col, in += 4, out += 4) {
         /* The word is loaded natively, so the shifts are endian-neutral. */
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t red = pixel & 0x3ff;
         const uint32_t green = (pixel >> 10) & 0x3ff;
         const uint32_t blue = (pixel >> 20) & 0x3ff;
         const uint32_t alpha = pixel >> 30;
         out[0] = (float)(red * (1.0f/0x3ff));
         out[1] = (float)(green * (1.0f/0x3ff));
         out[2] = (float)(blue * (1.0f/0x3ff));
         out[3] = (float)(alpha * (1.0f/0x3));
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of RGBA floats into R10G10B10A2_UNORM pixels.
 *
 * Every channel is clamped to [0, 1]; the colour channels are scaled by
 * 1023 and alpha by 3, then rounded with util_iround().  Red lands in
 * bits 0-9, green in bits 10-19, blue in bits 20-29 and alpha in bits
 * 30-31 of a 32-bit word written in host byte order.
 *
 * \param dst_row     first destination row (four bytes per pixel)
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row (four floats per pixel)
 * \param src_stride  byte distance between source rows
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_r10g10b10a2_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t red = ((uint32_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x3ff)) & 0x3ff;
         const uint32_t green = ((uint32_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x3ff)) & 0x3ff;
         const uint32_t blue = ((uint32_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x3ff)) & 0x3ff;
         /* Topmost field is not masked; the 32-bit shift bounds it. */
         const uint32_t alpha = (uint32_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0x3);
         *(uint32_t *)out = red | (green << 10) | (blue << 20) | (alpha << 30);
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Decode a single R10G10B10A2_UNORM pixel at src into four RGBA floats.
 *
 * The pixel is read as one 32-bit word through a uint32_t pointer: red in
 * bits 0-9, green in bits 10-19, blue in bits 20-29, alpha in bits 30-31.
 * Colour codes are scaled by 1/0x3ff and alpha by 1/0x3.  The i and j
 * arguments are not used.
 */
static inline void
util_format_r10g10b10a2_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The field positions do not depend on host endianness. */
   const uint32_t pixel = *(const uint32_t *)src;
   const uint32_t red = pixel & 0x3ff;
   const uint32_t green = (pixel >> 10) & 0x3ff;
   const uint32_t blue = (pixel >> 20) & 0x3ff;
   const uint32_t alpha = pixel >> 30;

   dst[0] = (float)(red * (1.0f/0x3ff));
   dst[1] = (float)(green * (1.0f/0x3ff));
   dst[2] = (float)(blue * (1.0f/0x3ff));
   dst[3] = (float)(alpha * (1.0f/0x3));
}

/**
 * Unpack R10G10B10A2_UNORM pixels into 8-bit-per-channel RGBA.
 *
 * Each pixel is one 32-bit word read through a uint32_t pointer: red in bits
 * 0-9, green in bits 10-19, blue in bits 20-29, alpha in bits 30-31.  The
 * 10-bit colour codes are reduced to 8 bits by dropping their two low bits;
 * the 2-bit alpha code is rescaled with a * 0xff / 0x3.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_r10g10b10a2_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         /* The field positions do not depend on host endianness. */
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t red = pixel & 0x3ff;
         const uint32_t green = (pixel >> 10) & 0x3ff;
         const uint32_t blue = (pixel >> 20) & 0x3ff;
         const uint32_t alpha = pixel >> 30;

         out[0] = (uint8_t)(red >> 2);
         out[1] = (uint8_t)(green >> 2);
         out[2] = (uint8_t)(blue >> 2);
         out[3] = (uint8_t)(alpha * 0xff / 0x3);

         in += 4;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack 8-bit-per-channel RGBA into R10G10B10A2_UNORM pixels.
 *
 * Colour bytes are widened to 10 bits with c * 0x3ff / 0xff; alpha keeps only
 * its two most significant bits.  The word is stored through a uint32_t
 * pointer with red in bits 0-9, green in bits 10-19, blue in bits 20-29 and
 * alpha in bits 30-31.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_r10g10b10a2_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         /* The word layout is the same for either host endianness. */
         const uint32_t red = (uint32_t)in[0] * 0x3ff / 0xff;
         const uint32_t green = (uint32_t)in[1] * 0x3ff / 0xff;
         const uint32_t blue = (uint32_t)in[2] * 0x3ff / 0xff;
         const uint32_t alpha = (uint32_t)(in[3] >> 6);

         *(uint32_t *)out = (red & 0x3ff) |
                            ((green & 0x3ff) << 10) |
                            ((blue & 0x3ff) << 20) |
                            (alpha << 30);

         in += 4;
         out += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack R10G10B10X2_UNORM pixels into RGBA floats.
 *
 * Each pixel is one 32-bit word read through a uint32_t pointer: red in bits
 * 0-9, green in bits 10-19, blue in bits 20-29.  The two top bits are ignored
 * and alpha is always written as 1.  Colour codes are scaled by 1/0x3ff.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_r10g10b10x2_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         /* The field positions do not depend on host endianness. */
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t red = pixel & 0x3ff;
         const uint32_t green = (pixel >> 10) & 0x3ff;
         const uint32_t blue = (pixel >> 20) & 0x3ff;

         out[0] = (float)(red * (1.0f/0x3ff));
         out[1] = (float)(green * (1.0f/0x3ff));
         out[2] = (float)(blue * (1.0f/0x3ff));
         out[3] = 1;

         in += 4;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA floats into R10G10B10X2_UNORM pixels.
 *
 * Red, green and blue are clamped to [0, 1] with CLAMP, scaled by 0x3ff and
 * converted with util_iround (a helper defined outside this chunk; presumably
 * round-to-nearest).  The source alpha is not read and the two top bits of
 * the stored word are left at zero.  Red lands in bits 0-9, green in bits
 * 10-19 and blue in bits 20-29 of a word stored through a uint32_t pointer.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_r10g10b10x2_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         /* The word layout is the same for either host endianness. */
         const uint32_t red = (uint32_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x3ff);
         const uint32_t green = (uint32_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x3ff);
         const uint32_t blue = (uint32_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x3ff);

         *(uint32_t *)out = (red & 0x3ff) |
                            ((green & 0x3ff) << 10) |
                            ((blue & 0x3ff) << 20);

         in += 4;
         out += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Decode a single R10G10B10X2_UNORM pixel at src into four RGBA floats.
 *
 * The pixel is read as one 32-bit word through a uint32_t pointer: red in
 * bits 0-9, green in bits 10-19, blue in bits 20-29.  The two top bits are
 * ignored and alpha is written as 1.  The i and j arguments are not used.
 */
static inline void
util_format_r10g10b10x2_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The field positions do not depend on host endianness. */
   const uint32_t pixel = *(const uint32_t *)src;
   const uint32_t red = pixel & 0x3ff;
   const uint32_t green = (pixel >> 10) & 0x3ff;
   const uint32_t blue = (pixel >> 20) & 0x3ff;

   dst[0] = (float)(red * (1.0f/0x3ff));
   dst[1] = (float)(green * (1.0f/0x3ff));
   dst[2] = (float)(blue * (1.0f/0x3ff));
   dst[3] = 1;
}

/**
 * Unpack R10G10B10X2_UNORM pixels into 8-bit-per-channel RGBA.
 *
 * Each pixel is one 32-bit word read through a uint32_t pointer: red in bits
 * 0-9, green in bits 10-19, blue in bits 20-29.  Colour codes are reduced to
 * 8 bits by dropping their two low bits.  The two top bits of the word are
 * ignored and alpha is always written as 255.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_r10g10b10x2_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         /* The field positions do not depend on host endianness. */
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t red = pixel & 0x3ff;
         const uint32_t green = (pixel >> 10) & 0x3ff;
         const uint32_t blue = (pixel >> 20) & 0x3ff;

         out[0] = (uint8_t)(red >> 2);
         out[1] = (uint8_t)(green >> 2);
         out[2] = (uint8_t)(blue >> 2);
         out[3] = 255;

         in += 4;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack 8-bit-per-channel RGBA into R10G10B10X2_UNORM pixels.
 *
 * Colour bytes are widened to 10 bits with c * 0x3ff / 0xff.  The source
 * alpha byte is not read and the two top bits of the stored word are zero.
 * Red lands in bits 0-9, green in bits 10-19 and blue in bits 20-29 of a word
 * stored through a uint32_t pointer.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_r10g10b10x2_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         /* The word layout is the same for either host endianness. */
         const uint32_t red = (uint32_t)in[0] * 0x3ff / 0xff;
         const uint32_t green = (uint32_t)in[1] * 0x3ff / 0xff;
         const uint32_t blue = (uint32_t)in[2] * 0x3ff / 0xff;

         *(uint32_t *)out = (red & 0x3ff) |
                            ((green & 0x3ff) << 10) |
                            ((blue & 0x3ff) << 20);

         in += 4;
         out += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack B10G10R10A2_UNORM pixels into RGBA floats.
 *
 * Each pixel is one 32-bit word read through a uint32_t pointer: blue in bits
 * 0-9, green in bits 10-19, red in bits 20-29, alpha in bits 30-31.  Colour
 * codes are scaled by 1/0x3ff and alpha by 1/0x3; the output order is always
 * r, g, b, a.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_b10g10r10a2_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         /* The field positions do not depend on host endianness. */
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t blue = pixel & 0x3ff;
         const uint32_t green = (pixel >> 10) & 0x3ff;
         const uint32_t red = (pixel >> 20) & 0x3ff;
         const uint32_t alpha = pixel >> 30;

         out[0] = (float)(red * (1.0f/0x3ff));
         out[1] = (float)(green * (1.0f/0x3ff));
         out[2] = (float)(blue * (1.0f/0x3ff));
         out[3] = (float)(alpha * (1.0f/0x3));

         in += 4;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA floats into B10G10R10A2_UNORM pixels.
 *
 * Each channel is clamped to [0, 1] with CLAMP, scaled to the field's maximum
 * code (0x3ff for colour, 0x3 for alpha) and converted with util_iround (a
 * helper defined outside this chunk; presumably round-to-nearest).  The word
 * is stored through a uint32_t pointer with blue in bits 0-9, green in bits
 * 10-19, red in bits 20-29 and alpha in bits 30-31.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_b10g10r10a2_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         /* The word layout is the same for either host endianness. */
         const uint32_t red = (uint32_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x3ff);
         const uint32_t green = (uint32_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x3ff);
         const uint32_t blue = (uint32_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x3ff);
         const uint32_t alpha = (uint32_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0x3);

         *(uint32_t *)out = (blue & 0x3ff) |
                            ((green & 0x3ff) << 10) |
                            ((red & 0x3ff) << 20) |
                            (alpha << 30);

         in += 4;
         out += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Decode a single B10G10R10A2_UNORM pixel at src into four RGBA floats.
 *
 * The pixel is read as one 32-bit word through a uint32_t pointer: blue in
 * bits 0-9, green in bits 10-19, red in bits 20-29, alpha in bits 30-31.
 * Colour codes are scaled by 1/0x3ff and alpha by 1/0x3.  The i and j
 * arguments are not used.
 */
static inline void
util_format_b10g10r10a2_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The field positions do not depend on host endianness. */
   const uint32_t pixel = *(const uint32_t *)src;
   const uint32_t blue = pixel & 0x3ff;
   const uint32_t green = (pixel >> 10) & 0x3ff;
   const uint32_t red = (pixel >> 20) & 0x3ff;
   const uint32_t alpha = pixel >> 30;

   dst[0] = (float)(red * (1.0f/0x3ff));
   dst[1] = (float)(green * (1.0f/0x3ff));
   dst[2] = (float)(blue * (1.0f/0x3ff));
   dst[3] = (float)(alpha * (1.0f/0x3));
}

/**
 * Unpack B10G10R10A2_UNORM pixels into 8-bit-per-channel RGBA.
 *
 * Each pixel is one 32-bit word read through a uint32_t pointer: blue in bits
 * 0-9, green in bits 10-19, red in bits 20-29, alpha in bits 30-31.  Colour
 * codes are reduced to 8 bits by dropping their two low bits; the 2-bit alpha
 * code is rescaled with a * 0xff / 0x3.  Output order is r, g, b, a.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_b10g10r10a2_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         /* The field positions do not depend on host endianness. */
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t blue = pixel & 0x3ff;
         const uint32_t green = (pixel >> 10) & 0x3ff;
         const uint32_t red = (pixel >> 20) & 0x3ff;
         const uint32_t alpha = pixel >> 30;

         out[0] = (uint8_t)(red >> 2);
         out[1] = (uint8_t)(green >> 2);
         out[2] = (uint8_t)(blue >> 2);
         out[3] = (uint8_t)(alpha * 0xff / 0x3);

         in += 4;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack 8-bit-per-channel RGBA into B10G10R10A2_UNORM pixels.
 *
 * Colour bytes are widened to 10 bits with c * 0x3ff / 0xff; alpha keeps only
 * its two most significant bits.  The word is stored through a uint32_t
 * pointer with blue in bits 0-9, green in bits 10-19, red in bits 20-29 and
 * alpha in bits 30-31.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_b10g10r10a2_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         /* The word layout is the same for either host endianness. */
         const uint32_t red = (uint32_t)in[0] * 0x3ff / 0xff;
         const uint32_t green = (uint32_t)in[1] * 0x3ff / 0xff;
         const uint32_t blue = (uint32_t)in[2] * 0x3ff / 0xff;
         const uint32_t alpha = (uint32_t)(in[3] >> 6);

         *(uint32_t *)out = (blue & 0x3ff) |
                            ((green & 0x3ff) << 10) |
                            ((red & 0x3ff) << 20) |
                            (alpha << 30);

         in += 4;
         out += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack A2R10G10B10_UNORM pixels into RGBA floats.
 *
 * Each pixel is one 32-bit word read through a uint32_t pointer: alpha in
 * bits 0-1, red in bits 2-11, green in bits 12-21, blue in bits 22-31.
 * Colour codes are scaled by 1/0x3ff and alpha by 1/0x3; the output order is
 * always r, g, b, a.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_a2r10g10b10_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         /* The field positions do not depend on host endianness. */
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t alpha = pixel & 0x3;
         const uint32_t red = (pixel >> 2) & 0x3ff;
         const uint32_t green = (pixel >> 12) & 0x3ff;
         const uint32_t blue = pixel >> 22;

         out[0] = (float)(red * (1.0f/0x3ff));
         out[1] = (float)(green * (1.0f/0x3ff));
         out[2] = (float)(blue * (1.0f/0x3ff));
         out[3] = (float)(alpha * (1.0f/0x3));

         in += 4;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA floats into A2R10G10B10_UNORM pixels.
 *
 * Each channel is clamped to [0, 1] with CLAMP, scaled to the field's maximum
 * code (0x3ff for colour, 0x3 for alpha) and converted with util_iround (a
 * helper defined outside this chunk; presumably round-to-nearest).  The word
 * is stored through a uint32_t pointer with alpha in bits 0-1, red in bits
 * 2-11, green in bits 12-21 and blue in bits 22-31.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_a2r10g10b10_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         /* The word layout is the same for either host endianness. */
         const uint32_t red = (uint32_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x3ff);
         const uint32_t green = (uint32_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x3ff);
         const uint32_t blue = (uint32_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x3ff);
         const uint32_t alpha = (uint32_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0x3);

         /* Blue occupies the top field, so the shift alone bounds it. */
         *(uint32_t *)out = (alpha & 0x3) |
                            ((red & 0x3ff) << 2) |
                            ((green & 0x3ff) << 12) |
                            (blue << 22);

         in += 4;
         out += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Decode a single A2R10G10B10_UNORM pixel at src into four RGBA floats.
 *
 * The pixel is read as one 32-bit word through a uint32_t pointer: alpha in
 * bits 0-1, red in bits 2-11, green in bits 12-21, blue in bits 22-31.
 * Colour codes are scaled by 1/0x3ff and alpha by 1/0x3.  The i and j
 * arguments are not used.
 */
static inline void
util_format_a2r10g10b10_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The field positions do not depend on host endianness. */
   const uint32_t pixel = *(const uint32_t *)src;
   const uint32_t alpha = pixel & 0x3;
   const uint32_t red = (pixel >> 2) & 0x3ff;
   const uint32_t green = (pixel >> 12) & 0x3ff;
   const uint32_t blue = pixel >> 22;

   dst[0] = (float)(red * (1.0f/0x3ff));
   dst[1] = (float)(green * (1.0f/0x3ff));
   dst[2] = (float)(blue * (1.0f/0x3ff));
   dst[3] = (float)(alpha * (1.0f/0x3));
}

/**
 * Unpack A2R10G10B10_UNORM pixels into 8-bit-per-channel RGBA.
 *
 * Each pixel is one 32-bit word read through a uint32_t pointer: alpha in
 * bits 0-1, red in bits 2-11, green in bits 12-21, blue in bits 22-31.
 * Colour codes are reduced to 8 bits by dropping their two low bits; the
 * 2-bit alpha code is rescaled with a * 0xff / 0x3.  Output order is
 * r, g, b, a.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_a2r10g10b10_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         /* The field positions do not depend on host endianness. */
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t alpha = pixel & 0x3;
         const uint32_t red = (pixel >> 2) & 0x3ff;
         const uint32_t green = (pixel >> 12) & 0x3ff;
         const uint32_t blue = pixel >> 22;

         out[0] = (uint8_t)(red >> 2);
         out[1] = (uint8_t)(green >> 2);
         out[2] = (uint8_t)(blue >> 2);
         out[3] = (uint8_t)(alpha * 0xff / 0x3);

         in += 4;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack 8-bit-per-channel RGBA into A2R10G10B10_UNORM pixels.
 *
 * Colour bytes are widened to 10 bits with c * 0x3ff / 0xff; alpha keeps only
 * its two most significant bits.  The word is stored through a uint32_t
 * pointer with alpha in bits 0-1, red in bits 2-11, green in bits 12-21 and
 * blue in bits 22-31.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_a2r10g10b10_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         /* The word layout is the same for either host endianness. */
         const uint32_t red = (uint32_t)in[0] * 0x3ff / 0xff;
         const uint32_t green = (uint32_t)in[1] * 0x3ff / 0xff;
         const uint32_t blue = (uint32_t)in[2] * 0x3ff / 0xff;
         const uint32_t alpha = (uint32_t)(in[3] >> 6);

         /* Blue occupies the top field, so the shift alone bounds it. */
         *(uint32_t *)out = (alpha & 0x3) |
                            ((red & 0x3ff) << 2) |
                            ((green & 0x3ff) << 12) |
                            (blue << 22);

         in += 4;
         out += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack A2B10G10R10_UNORM pixels into RGBA floats.
 *
 * Each pixel is one 32-bit word read through a uint32_t pointer: alpha in
 * bits 0-1, blue in bits 2-11, green in bits 12-21, red in bits 22-31.
 * Colour codes are scaled by 1/0x3ff and alpha by 1/0x3; the output order is
 * always r, g, b, a.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_a2b10g10r10_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         /* The field positions do not depend on host endianness. */
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t alpha = pixel & 0x3;
         const uint32_t blue = (pixel >> 2) & 0x3ff;
         const uint32_t green = (pixel >> 12) & 0x3ff;
         const uint32_t red = pixel >> 22;

         out[0] = (float)(red * (1.0f/0x3ff));
         out[1] = (float)(green * (1.0f/0x3ff));
         out[2] = (float)(blue * (1.0f/0x3ff));
         out[3] = (float)(alpha * (1.0f/0x3));

         in += 4;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA floats into A2B10G10R10_UNORM pixels.
 *
 * Each channel is clamped to [0, 1] with CLAMP, scaled to the field's maximum
 * code (0x3ff for colour, 0x3 for alpha) and converted with util_iround (a
 * helper defined outside this chunk; presumably round-to-nearest).  The word
 * is stored through a uint32_t pointer with alpha in bits 0-1, blue in bits
 * 2-11, green in bits 12-21 and red in bits 22-31.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_a2b10g10r10_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         /* The word layout is the same for either host endianness. */
         const uint32_t red = (uint32_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x3ff);
         const uint32_t green = (uint32_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x3ff);
         const uint32_t blue = (uint32_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x3ff);
         const uint32_t alpha = (uint32_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0x3);

         /* Red occupies the top field, so the shift alone bounds it. */
         *(uint32_t *)out = (alpha & 0x3) |
                            ((blue & 0x3ff) << 2) |
                            ((green & 0x3ff) << 12) |
                            (red << 22);

         in += 4;
         out += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Decode a single A2B10G10R10_UNORM pixel at src into four RGBA floats.
 *
 * The pixel is read as one 32-bit word through a uint32_t pointer: alpha in
 * bits 0-1, blue in bits 2-11, green in bits 12-21, red in bits 22-31.
 * Colour codes are scaled by 1/0x3ff and alpha by 1/0x3.  The i and j
 * arguments are not used.
 */
static inline void
util_format_a2b10g10r10_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The field positions do not depend on host endianness. */
   const uint32_t pixel = *(const uint32_t *)src;
   const uint32_t alpha = pixel & 0x3;
   const uint32_t blue = (pixel >> 2) & 0x3ff;
   const uint32_t green = (pixel >> 12) & 0x3ff;
   const uint32_t red = pixel >> 22;

   dst[0] = (float)(red * (1.0f/0x3ff));
   dst[1] = (float)(green * (1.0f/0x3ff));
   dst[2] = (float)(blue * (1.0f/0x3ff));
   dst[3] = (float)(alpha * (1.0f/0x3));
}

/**
 * Unpack A2B10G10R10_UNORM pixels into 8-bit-per-channel RGBA.
 *
 * Each pixel is one 32-bit word read through a uint32_t pointer: alpha in
 * bits 0-1, blue in bits 2-11, green in bits 12-21, red in bits 22-31.
 * Colour codes are reduced to 8 bits by dropping their two low bits; the
 * 2-bit alpha code is rescaled with a * 0xff / 0x3.  Output order is
 * r, g, b, a.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_a2b10g10r10_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         /* The field positions do not depend on host endianness. */
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t alpha = pixel & 0x3;
         const uint32_t blue = (pixel >> 2) & 0x3ff;
         const uint32_t green = (pixel >> 12) & 0x3ff;
         const uint32_t red = pixel >> 22;

         out[0] = (uint8_t)(red >> 2);
         out[1] = (uint8_t)(green >> 2);
         out[2] = (uint8_t)(blue >> 2);
         out[3] = (uint8_t)(alpha * 0xff / 0x3);

         in += 4;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack 8-bit-per-channel RGBA into A2B10G10R10_UNORM pixels.
 *
 * Colour bytes are widened to 10 bits with c * 0x3ff / 0xff; alpha keeps only
 * its two most significant bits.  The word is stored through a uint32_t
 * pointer with alpha in bits 0-1, blue in bits 2-11, green in bits 12-21 and
 * red in bits 22-31.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_a2b10g10r10_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         /* The word layout is the same for either host endianness. */
         const uint32_t red = (uint32_t)in[0] * 0x3ff / 0xff;
         const uint32_t green = (uint32_t)in[1] * 0x3ff / 0xff;
         const uint32_t blue = (uint32_t)in[2] * 0x3ff / 0xff;
         const uint32_t alpha = (uint32_t)(in[3] >> 6);

         /* Red occupies the top field, so the shift alone bounds it. */
         *(uint32_t *)out = (alpha & 0x3) |
                            ((blue & 0x3ff) << 2) |
                            ((green & 0x3ff) << 12) |
                            (red << 22);

         in += 4;
         out += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack one-byte R3G3B2_UNORM pixels into RGBA floats.
 *
 * Without UTIL_ARCH_BIG_ENDIAN the byte holds red in bits 0-2, green in bits
 * 3-5 and blue in bits 6-7.  With UTIL_ARCH_BIG_ENDIAN the code reads red
 * from bits 0-1, green from bits 2-4 and blue from bits 5-7.  Every code is
 * scaled by the reciprocal of its field maximum and alpha is written as 1.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_r3g3b2_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
         const uint8_t pixel = *in;
#if UTIL_ARCH_BIG_ENDIAN
         /* NOTE(review): this branch gives red 2 bits and blue 3 bits, unlike
          * the other branch and the format name; byte order should not matter
          * for a one-byte pixel -- confirm against the generator input.
          */
         const uint8_t red = pixel & 0x3;
         const uint8_t green = (pixel >> 2) & 0x7;
         const uint8_t blue = pixel >> 5;

         out[0] = (float)(red * (1.0f/0x3));
         out[1] = (float)(green * (1.0f/0x7));
         out[2] = (float)(blue * (1.0f/0x7));
#else
         const uint8_t red = pixel & 0x7;
         const uint8_t green = (pixel >> 3) & 0x7;
         const uint8_t blue = pixel >> 6;

         out[0] = (float)(red * (1.0f/0x7));
         out[1] = (float)(green * (1.0f/0x7));
         out[2] = (float)(blue * (1.0f/0x3));
#endif
         out[3] = 1;

         in += 1;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA floats into one-byte R3G3B2_UNORM pixels.
 *
 * Red, green and blue are clamped to [0, 1] with CLAMP, scaled to the field's
 * maximum code and converted with util_iround (a helper defined outside this
 * chunk; presumably round-to-nearest).  The source alpha is not read.
 *
 * Without UTIL_ARCH_BIG_ENDIAN red goes to bits 0-2, green to bits 3-5 and
 * blue to bits 6-7.  With UTIL_ARCH_BIG_ENDIAN the code writes red to bits
 * 0-1, green to bits 2-4 and blue to bits 5-7.
 *
 * dst_stride and src_stride are row pitches in bytes; width and height are
 * counted in pixels.
 */
static inline void
util_format_r3g3b2_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned rows_left, cols_left;

   for (rows_left = height; rows_left != 0; --rows_left) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (cols_left = width; cols_left != 0; --cols_left) {
#if UTIL_ARCH_BIG_ENDIAN
         /* NOTE(review): this branch gives red 2 bits and blue 3 bits, unlike
          * the other branch and the format name; byte order should not matter
          * for a one-byte pixel -- confirm against the generator input.
          */
         const uint8_t red = (uint8_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x3);
         const uint8_t green = (uint8_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x7);
         const uint8_t blue = (uint8_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x7);

         /* The final cast keeps only the low eight bits, as the original
          * uint8_t accumulator did.
          */
         *out = (uint8_t)((uint32_t)(red & 0x3) |
                          ((uint32_t)(green & 0x7) << 2) |
                          ((uint32_t)blue << 5));
#else
         const uint8_t red = (uint8_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x7);
         const uint8_t green = (uint8_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x7);
         const uint8_t blue = (uint8_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x3);

         /* The final cast keeps only the low eight bits, as the original
          * uint8_t accumulator did.
          */
         *out = (uint8_t)((uint32_t)(red & 0x7) |
                          ((uint32_t)(green & 0x7) << 3) |
                          ((uint32_t)blue << 6));
#endif

         in += 4;
         out += 1;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R3G3B2_UNORM texel and expand it to RGBA float.
 *
 * Red is taken from bits 0-2, green from bits 3-5 and blue from bits 6-7 of
 * the byte at src.  Each channel is scaled by the reciprocal of its maximum
 * value and alpha is written as 1.  The i and j arguments are unused.
 *
 * A texel is a single byte, so host byte order does not affect its layout
 * and the same decoding is used on little- and big-endian hosts.
 */
static inline void
util_format_r3g3b2_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint8_t value = *src;
   const uint8_t r = value & 0x7;
   const uint8_t g = (value >> 3) & 0x7;
   const uint8_t b = value >> 6;

   dst[0] = (float)(r * (1.0f/0x7)); /* r */
   dst[1] = (float)(g * (1.0f/0x7)); /* g */
   dst[2] = (float)(b * (1.0f/0x3)); /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a width x height rectangle of R3G3B2_UNORM texels to RGBA8 unorm.
 *
 * Red is read from bits 0-2, green from bits 3-5 and blue from bits 6-7 of
 * each source byte.  Every channel is rescaled to 0..255 with an integer
 * multiply/divide and alpha is written as 255.
 *
 * A texel is a single byte, so host byte order does not affect its layout
 * and the same decoding is used on little- and big-endian hosts.
 *
 * src_stride and dst_stride are expressed in bytes.
 */
static inline void
util_format_r3g3b2_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         const uint8_t value = *src;
         const uint8_t r = value & 0x7;
         const uint8_t g = (value >> 3) & 0x7;
         const uint8_t b = value >> 6;
         dst[0] = (uint8_t)(((uint32_t)r) * 0xff / 0x7); /* r */
         dst[1] = (uint8_t)(((uint32_t)g) * 0xff / 0x7); /* g */
         dst[2] = (uint8_t)(((uint32_t)b) * 0xff / 0x3); /* b */
         dst[3] = 255; /* a */
         src += 1;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA8 unorm pixels into R3G3B2_UNORM.
 *
 * The top three bits of red go to bits 0-2, the top three bits of green to
 * bits 3-5 and the top two bits of blue to bits 6-7 of the destination
 * byte.  Source alpha is ignored.
 *
 * A texel is a single byte, so host byte order does not affect its layout
 * and the same packing is used on little- and big-endian hosts.
 *
 * dst_stride and src_stride are expressed in bytes.
 */
static inline void
util_format_r3g3b2_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint8_t value = 0;
         value |= ((uint8_t)(src[0] >> 5)) & 0x7;
         value |= (uint32_t)(((uint8_t)(src[1] >> 5)) & 0x7) << 3;
         value |= (uint32_t)((uint8_t)(src[2] >> 6)) << 6;
         *dst = value;
         src += 4;
         dst += 1;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Expand a width x height rectangle of B2G3R3_UNORM texels to RGBA float.
 *
 * Each source byte holds blue in bits 0-1, green in bits 2-4 and red in
 * bits 5-7.  Channels are scaled by the reciprocal of their maximum value
 * and alpha is written as 1.  Both strides are in bytes.
 */
static inline void
util_format_b2g3r3_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (col = 0; col < width; ++col, in += 1, out += 4) {
         const uint8_t texel = *in;
         const uint8_t blue = texel & 0x3;
         const uint8_t green = (texel >> 2) & 0x7;
         const uint8_t red = texel >> 5;

         out[0] = (float)(red * (1.0f/0x7));
         out[1] = (float)(green * (1.0f/0x7));
         out[2] = (float)(blue * (1.0f/0x3));
         out[3] = 1.0f;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA float pixels into B2G3R3_UNORM.
 *
 * Every colour channel is clamped to [0, 1], scaled to its bit width and
 * rounded with util_iround().  Blue is stored in bits 0-1, green in bits
 * 2-4 and red in bits 5-7; alpha is dropped.  Both strides are in bytes.
 */
static inline void
util_format_b2g3r3_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 1) {
         const uint8_t blue = ((uint8_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x3)) & 0x3;
         const uint8_t green = ((uint8_t)util_iround(CLAMP(in[1], 0.0f, 1.0f) * 0x7)) & 0x7;
         const uint8_t red = (uint8_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0x7);

         *out = (uint8_t)(blue | ((uint32_t)green << 2) | ((uint32_t)red << 5));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single B2G3R3_UNORM texel and expand it to RGBA float.
 *
 * Blue is read from bits 0-1, green from bits 2-4 and red from bits 5-7 of
 * the byte at src; alpha is written as 1.  The i and j arguments are unused.
 */
static inline void
util_format_b2g3r3_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint8_t texel = *src;
   const uint8_t blue = texel & 0x3;
   const uint8_t green = (texel >> 2) & 0x7;
   const uint8_t red = texel >> 5;

   dst[0] = (float)(red * (1.0f/0x7));
   dst[1] = (float)(green * (1.0f/0x7));
   dst[2] = (float)(blue * (1.0f/0x3));
   dst[3] = 1.0f;
}

/**
 * Expand a width x height rectangle of B2G3R3_UNORM texels to RGBA8 unorm.
 *
 * Blue is read from bits 0-1, green from bits 2-4 and red from bits 5-7 of
 * each source byte.  Channels are rescaled to 0..255 with an integer
 * multiply/divide; alpha is written as 255.  Both strides are in bytes.
 */
static inline void
util_format_b2g3r3_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 1, out += 4) {
         const uint8_t texel = *in;
         const uint32_t blue = texel & 0x3;
         const uint32_t green = (texel >> 2) & 0x7;
         const uint32_t red = texel >> 5;

         out[0] = (uint8_t)(red * 0xff / 0x7);
         out[1] = (uint8_t)(green * 0xff / 0x7);
         out[2] = (uint8_t)(blue * 0xff / 0x3);
         out[3] = 255;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA8 unorm pixels into B2G3R3_UNORM.
 *
 * The top two bits of blue go to bits 0-1, the top three bits of green to
 * bits 2-4 and the top three bits of red to bits 5-7.  Source alpha is
 * ignored.  Both strides are in bytes.
 */
static inline void
util_format_b2g3r3_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 1) {
         const uint32_t blue = (uint8_t)(in[2] >> 6) & 0x3;
         const uint32_t green = (uint8_t)(in[1] >> 5) & 0x7;
         const uint32_t red = (uint8_t)(in[0] >> 5);

         *out = (uint8_t)(blue | (green << 2) | (red << 5));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Expand a width x height rectangle of L8_UNORM texels to RGBA float.
 *
 * The single source byte is converted with ubyte_to_float() and replicated
 * into r, g and b; alpha is written as 1.  Both strides are in bytes.
 */
static inline void
util_format_l8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (col = 0; col < width; ++col, in += 1, out += 4) {
         const float lum = ubyte_to_float(*in);

         out[0] = lum;
         out[1] = lum;
         out[2] = lum;
         out[3] = 1.0f;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA float pixels into L8_UNORM.
 *
 * Only the red channel of each source pixel is used; it is converted with
 * float_to_ubyte() and stored as the single output byte.  Both strides are
 * in bytes.
 */
static inline void
util_format_l8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 1) {
         *out = float_to_ubyte(in[0]);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single L8_UNORM texel and expand it to RGBA float.
 *
 * The byte at src is converted with ubyte_to_float() and replicated into
 * r, g and b; alpha is written as 1.  The i and j arguments are unused.
 */
static inline void
util_format_l8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const float lum = ubyte_to_float(*src);

   dst[0] = lum;
   dst[1] = lum;
   dst[2] = lum;
   dst[3] = 1.0f;
}

/**
 * Expand a width x height rectangle of L8_UNORM texels to RGBA8 unorm.
 *
 * The source byte is copied unchanged into r, g and b; alpha is written as
 * 255.  Both strides are in bytes.
 */
static inline void
util_format_l8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 1, out += 4) {
         const uint8_t lum = *in;

         out[0] = lum;
         out[1] = lum;
         out[2] = lum;
         out[3] = 255;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA8 unorm pixels into L8_UNORM.
 *
 * The red byte of each source pixel is copied to the output; green, blue
 * and alpha are ignored.  Both strides are in bytes.
 */
static inline void
util_format_l8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 1) {
         *out = in[0];
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Expand a width x height rectangle of A8_UNORM texels to RGBA float.
 *
 * r, g and b are written as 0; alpha is the source byte converted with
 * ubyte_to_float().  Both strides are in bytes.
 */
static inline void
util_format_a8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (col = 0; col < width; ++col, in += 1, out += 4) {
         const uint8_t alpha = *in;

         out[0] = 0.0f;
         out[1] = 0.0f;
         out[2] = 0.0f;
         out[3] = ubyte_to_float(alpha);
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA float pixels into A8_UNORM.
 *
 * Only the alpha channel of each source pixel is used; it is converted
 * with float_to_ubyte() and stored as the single output byte.  Both strides
 * are in bytes.
 */
static inline void
util_format_a8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 1) {
         *out = float_to_ubyte(in[3]);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single A8_UNORM texel and expand it to RGBA float.
 *
 * r, g and b are written as 0; alpha is the byte at src converted with
 * ubyte_to_float().  The i and j arguments are unused.
 */
static inline void
util_format_a8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint8_t alpha = *src;

   dst[0] = 0.0f;
   dst[1] = 0.0f;
   dst[2] = 0.0f;
   dst[3] = ubyte_to_float(alpha);
}

/**
 * Expand a width x height rectangle of A8_UNORM texels to RGBA8 unorm.
 *
 * r, g and b are written as 0 and the source byte is copied to alpha.
 * Both strides are in bytes.
 */
static inline void
util_format_a8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 1, out += 4) {
         const uint8_t alpha = *in;

         out[0] = 0;
         out[1] = 0;
         out[2] = 0;
         out[3] = alpha;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA8 unorm pixels into A8_UNORM.
 *
 * The alpha byte of each source pixel is copied to the output; the colour
 * channels are ignored.  Both strides are in bytes.
 */
static inline void
util_format_a8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 1) {
         *out = in[3];
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Expand a width x height rectangle of I8_UNORM texels to RGBA float.
 *
 * The single source byte is converted with ubyte_to_float() and replicated
 * into all four output channels, alpha included.  Both strides are in
 * bytes.
 */
static inline void
util_format_i8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (col = 0; col < width; ++col, in += 1, out += 4) {
         const float intensity = ubyte_to_float(*in);

         out[0] = intensity;
         out[1] = intensity;
         out[2] = intensity;
         out[3] = intensity;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA float pixels into I8_UNORM.
 *
 * Only the red channel of each source pixel is used; it is converted with
 * float_to_ubyte() and stored as the single output byte.  Both strides are
 * in bytes.
 */
static inline void
util_format_i8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 1) {
         *out = float_to_ubyte(in[0]);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single I8_UNORM texel and expand it to RGBA float.
 *
 * The byte at src is converted with ubyte_to_float() and written to all
 * four output channels.  The i and j arguments are unused.
 */
static inline void
util_format_i8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const float intensity = ubyte_to_float(*src);

   dst[0] = intensity;
   dst[1] = intensity;
   dst[2] = intensity;
   dst[3] = intensity;
}

/**
 * Expand a width x height rectangle of I8_UNORM texels to RGBA8 unorm.
 *
 * The source byte is copied unchanged into all four output channels.
 * Both strides are in bytes.
 */
static inline void
util_format_i8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 1, out += 4) {
         const uint8_t intensity = *in;

         out[0] = intensity;
         out[1] = intensity;
         out[2] = intensity;
         out[3] = intensity;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA8 unorm pixels into I8_UNORM.
 *
 * The red byte of each source pixel is copied to the output; green, blue
 * and alpha are ignored.  Both strides are in bytes.
 */
static inline void
util_format_i8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 1) {
         *out = in[0];
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Expand a width x height rectangle of L4A4_UNORM texels to RGBA float.
 *
 * The low nibble of each source byte is scaled by 1/15 and replicated into
 * r, g and b; the high nibble is scaled by 1/15 and written to alpha.
 * Both strides are in bytes.
 */
static inline void
util_format_l4a4_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (col = 0; col < width; ++col, in += 1, out += 4) {
         const uint8_t texel = *in;
         const uint8_t lum_bits = texel & 0xf;
         const uint8_t alpha_bits = texel >> 4;
         const float lum = (float)(lum_bits * (1.0f/0xf));

         out[0] = lum;
         out[1] = lum;
         out[2] = lum;
         out[3] = (float)(alpha_bits * (1.0f/0xf));
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA float pixels into L4A4_UNORM.
 *
 * Red and alpha are each clamped to [0, 1], scaled by 15 and rounded with
 * util_iround().  Red is stored in the low nibble and alpha in the high
 * nibble; green and blue are ignored.  Both strides are in bytes.
 */
static inline void
util_format_l4a4_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 1) {
         const uint8_t lum = ((uint8_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0xf)) & 0xf;
         const uint8_t alpha = (uint8_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0xf);

         *out = (uint8_t)(lum | ((uint32_t)alpha << 4));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single L4A4_UNORM texel and expand it to RGBA float.
 *
 * The low nibble of the byte at src is scaled by 1/15 and replicated into
 * r, g and b; the high nibble is scaled by 1/15 and written to alpha.  The
 * i and j arguments are unused.
 */
static inline void
util_format_l4a4_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint8_t texel = *src;
   const uint8_t lum_bits = texel & 0xf;
   const uint8_t alpha_bits = texel >> 4;
   const float lum = (float)(lum_bits * (1.0f/0xf));

   dst[0] = lum;
   dst[1] = lum;
   dst[2] = lum;
   dst[3] = (float)(alpha_bits * (1.0f/0xf));
}

/**
 * Expand a width x height rectangle of L4A4_UNORM texels to RGBA8 unorm.
 *
 * The low nibble of each source byte is rescaled to 0..255 and replicated
 * into r, g and b; the high nibble is rescaled to 0..255 and written to
 * alpha.  Both strides are in bytes.
 */
static inline void
util_format_l4a4_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 1, out += 4) {
         const uint8_t texel = *in;
         const uint32_t lum_bits = texel & 0xf;
         const uint32_t alpha_bits = texel >> 4;
         const uint8_t lum = (uint8_t)(lum_bits * 0xff / 0xf);

         out[0] = lum;
         out[1] = lum;
         out[2] = lum;
         out[3] = (uint8_t)(alpha_bits * 0xff / 0xf);
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA8 unorm pixels into L4A4_UNORM.
 *
 * The top four bits of red are stored in the low nibble and the top four
 * bits of alpha in the high nibble; green and blue are ignored.  Both
 * strides are in bytes.
 */
static inline void
util_format_l4a4_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 1) {
         const uint32_t lum = (uint8_t)(in[0] >> 4) & 0xf;
         const uint32_t alpha = (uint8_t)(in[3] >> 4);

         *out = (uint8_t)(lum | (alpha << 4));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Expand a width x height rectangle of L8A8_UNORM texels to RGBA float.
 *
 * Each texel is read as one 16-bit word.  Where UTIL_ARCH_BIG_ENDIAN is
 * set the luminance byte is the high half of that word and alpha the low
 * half; otherwise the halves are swapped.  Luminance is converted with
 * ubyte_to_float() and replicated into r, g and b; alpha is converted the
 * same way.  Both strides are in bytes.
 */
static inline void
util_format_l8a8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned lum_shift = 8, alpha_shift = 0;
#else
   const unsigned lum_shift = 0, alpha_shift = 8;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (col = 0; col < width; ++col, in += 2, out += 4) {
         const uint16_t texel = *(const uint16_t *)in;
         const float lum = ubyte_to_float((uint8_t)(texel >> lum_shift));

         out[0] = lum;
         out[1] = lum;
         out[2] = lum;
         out[3] = ubyte_to_float((uint8_t)(texel >> alpha_shift));
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA float pixels into L8A8_UNORM.
 *
 * Red and alpha are converted with float_to_ubyte() and combined into one
 * 16-bit word.  Where UTIL_ARCH_BIG_ENDIAN is set red goes to the high
 * half of the word and alpha to the low half; otherwise the halves are
 * swapped.  Green and blue are ignored.  Both strides are in bytes.
 */
static inline void
util_format_l8a8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned lum_shift = 8, alpha_shift = 0;
#else
   const unsigned lum_shift = 0, alpha_shift = 8;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 2) {
         const uint32_t lum = float_to_ubyte(in[0]);
         const uint32_t alpha = float_to_ubyte(in[3]);

         *(uint16_t *)out = (uint16_t)((lum << lum_shift) | (alpha << alpha_shift));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single L8A8_UNORM texel and expand it to RGBA float.
 *
 * The texel is read as one 16-bit word.  Where UTIL_ARCH_BIG_ENDIAN is set
 * luminance is the high half of that word and alpha the low half;
 * otherwise the halves are swapped.  Luminance is replicated into r, g and
 * b.  The i and j arguments are unused.
 */
static inline void
util_format_l8a8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned lum_shift = 8, alpha_shift = 0;
#else
   const unsigned lum_shift = 0, alpha_shift = 8;
#endif
   const uint16_t texel = *(const uint16_t *)src;
   const float lum = ubyte_to_float((uint8_t)(texel >> lum_shift));

   dst[0] = lum;
   dst[1] = lum;
   dst[2] = lum;
   dst[3] = ubyte_to_float((uint8_t)(texel >> alpha_shift));
}

/**
 * Expand a width x height rectangle of L8A8_UNORM texels to RGBA8 unorm.
 *
 * Each texel is read as one 16-bit word.  Where UTIL_ARCH_BIG_ENDIAN is
 * set luminance is the high half of that word and alpha the low half;
 * otherwise the halves are swapped.  Luminance is copied into r, g and b
 * and alpha into a.  Both strides are in bytes.
 */
static inline void
util_format_l8a8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned lum_shift = 8, alpha_shift = 0;
#else
   const unsigned lum_shift = 0, alpha_shift = 8;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 2, out += 4) {
         const uint16_t texel = *(const uint16_t *)in;
         const uint8_t lum = (uint8_t)(texel >> lum_shift);
         const uint8_t alpha = (uint8_t)(texel >> alpha_shift);

         out[0] = lum;
         out[1] = lum;
         out[2] = lum;
         out[3] = alpha;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA8 unorm pixels into L8A8_UNORM.
 *
 * Red and alpha are combined into one 16-bit word.  Where
 * UTIL_ARCH_BIG_ENDIAN is set red goes to the high half of the word and
 * alpha to the low half; otherwise the halves are swapped.  Green and blue
 * are ignored.  Both strides are in bytes.
 */
static inline void
util_format_l8a8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned lum_shift = 8, alpha_shift = 0;
#else
   const unsigned lum_shift = 0, alpha_shift = 8;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 2) {
         const uint32_t lum = in[0];
         const uint32_t alpha = in[3];

         *(uint16_t *)out = (uint16_t)((lum << lum_shift) | (alpha << alpha_shift));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Expand a width x height rectangle of L16_UNORM texels to RGBA float.
 *
 * Each 16-bit source word is scaled by 1/65535 and replicated into r, g
 * and b; alpha is written as 1.  Both strides are in bytes.
 */
static inline void
util_format_l16_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (col = 0; col < width; ++col, in += 2, out += 4) {
         const uint16_t texel = *(const uint16_t *)in;
         const float lum = (float)(texel * (1.0f/0xffff));

         out[0] = lum;
         out[1] = lum;
         out[2] = lum;
         out[3] = 1.0f;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA float pixels into L16_UNORM.
 *
 * Only the red channel is used: it is clamped to [0, 1], scaled by 65535,
 * rounded with util_iround() and stored as one 16-bit word.  Both strides
 * are in bytes.
 */
static inline void
util_format_l16_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 2) {
         *(uint16_t *)out = (uint16_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0xffff);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single L16_UNORM texel and expand it to RGBA float.
 *
 * The 16-bit word at src is scaled by 1/65535 and replicated into r, g and
 * b; alpha is written as 1.  The i and j arguments are unused.
 */
static inline void
util_format_l16_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint16_t texel = *(const uint16_t *)src;
   const float lum = (float)(texel * (1.0f/0xffff));

   dst[0] = lum;
   dst[1] = lum;
   dst[2] = lum;
   dst[3] = 1.0f;
}

/**
 * Expand a width x height rectangle of L16_UNORM texels to RGBA8 unorm.
 *
 * The high byte of each 16-bit source word is replicated into r, g and b
 * (the low byte is discarded); alpha is written as 255.  Both strides are
 * in bytes.
 */
static inline void
util_format_l16_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 2, out += 4) {
         const uint16_t texel = *(const uint16_t *)in;
         const uint8_t lum = (uint8_t)(texel >> 8);

         out[0] = lum;
         out[1] = lum;
         out[2] = lum;
         out[3] = 255;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA8 unorm pixels into L16_UNORM.
 *
 * The red byte of each source pixel is rescaled from 0..255 to 0..65535
 * with an integer multiply/divide and stored as one 16-bit word; green,
 * blue and alpha are ignored.  Both strides are in bytes.
 */
static inline void
util_format_l16_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 2) {
         const uint32_t lum = in[0];

         *(uint16_t *)out = (uint16_t)(lum * 0xffff / 0xff);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Expand a width x height rectangle of A16_UNORM texels to RGBA float.
 *
 * r, g and b are written as 0; alpha is the 16-bit source word scaled by
 * 1/65535.  Both strides are in bytes.
 */
static inline void
util_format_a16_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (col = 0; col < width; ++col, in += 2, out += 4) {
         const uint16_t alpha = *(const uint16_t *)in;

         out[0] = 0.0f;
         out[1] = 0.0f;
         out[2] = 0.0f;
         out[3] = (float)(alpha * (1.0f/0xffff));
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA float pixels into A16_UNORM.
 *
 * Only the alpha channel is used: it is clamped to [0, 1], scaled by
 * 65535, rounded with util_iround() and stored as one 16-bit word.  Both
 * strides are in bytes.
 */
static inline void
util_format_a16_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 2) {
         *(uint16_t *)out = (uint16_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0xffff);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch a single A16_UNORM texel as an RGBA float pixel.
 *
 * src points at the 16-bit texel; the i and j arguments are not used.
 * Alpha is the texel scaled by 1/0xffff, the colour channels are 0.
 */
static inline void
util_format_a16_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint16_t alpha = *(const uint16_t *)src;

   dst[0] = 0; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = (float)(alpha * (1.0f/0xffff)); /* a */
}

/**
 * Unpack rows of 16-bit A16_UNORM texels into 4-byte RGBA8 UNORM pixels.
 *
 * The top eight bits of each texel become the alpha byte; red, green and
 * blue are written as 0.  Both strides are byte counts.
 */
static inline void
util_format_a16_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint16_t *texel = (const uint16_t *)src_row;
      uint8_t *rgba = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint16_t alpha = texel[col];
         rgba[0] = 0; /* r */
         rgba[1] = 0; /* g */
         rgba[2] = 0; /* b */
         rgba[3] = (uint8_t)(alpha >> 8); /* a */
         rgba += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(uint8_t);
   }
}

/**
 * Pack rows of 4-byte RGBA8 UNORM pixels into 16-bit A16_UNORM texels.
 *
 * Only the alpha byte (index 3) of each pixel is read; it is widened from
 * 0..0xff to 0..0xffff and stored as one native 16-bit word.
 * Both strides are byte counts.
 */
static inline void
util_format_a16_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *pixel = src_row;
      uint16_t *texel = (uint16_t *)dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint32_t alpha = pixel[3];
         texel[col] = (uint16_t)(alpha * 0xffff / 0xff);
         pixel += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(uint8_t);
   }
}

/**
 * Unpack rows of 16-bit I16_UNORM texels into RGBA float pixels.
 *
 * The single channel is scaled by 1/0xffff and replicated into all four
 * outputs (r, g, b and a).  Both strides are byte counts.
 */
static inline void
util_format_i16_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint16_t *texel = (const uint16_t *)src_row;
      float *rgba = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint16_t raw = texel[col];
         const float intensity = (float)(raw * (1.0f/0xffff));
         rgba[0] = intensity; /* r */
         rgba[1] = intensity; /* g */
         rgba[2] = intensity; /* b */
         rgba[3] = intensity; /* a */
         rgba += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(float);
   }
}

/**
 * Pack rows of RGBA float pixels into 16-bit I16_UNORM texels.
 *
 * Only the red component (index 0) is used: it is clamped to [0, 1],
 * scaled by 0xffff, rounded with util_iround() and stored as one native
 * 16-bit word.  Both strides are byte counts.
 */
static inline void
util_format_i16_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *pixel = src_row;
      uint16_t *texel = (uint16_t *)dst_row;

      for (unsigned col = 0; col < width; ++col) {
         texel[col] = (uint16_t)util_iround(CLAMP(pixel[0], 0.0f, 1.0f) * 0xffff);
         pixel += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Fetch a single I16_UNORM texel as an RGBA float pixel.
 *
 * src points at the 16-bit texel; the i and j arguments are not used.
 * The channel, scaled by 1/0xffff, is written to all four outputs.
 */
static inline void
util_format_i16_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint16_t raw = *(const uint16_t *)src;
   const float intensity = (float)(raw * (1.0f/0xffff));

   dst[0] = intensity; /* r */
   dst[1] = intensity; /* g */
   dst[2] = intensity; /* b */
   dst[3] = intensity; /* a */
}

/**
 * Unpack rows of 16-bit I16_UNORM texels into 4-byte RGBA8 UNORM pixels.
 *
 * The top eight bits of each texel are replicated into r, g, b and a.
 * Both strides are byte counts.
 */
static inline void
util_format_i16_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint16_t *texel = (const uint16_t *)src_row;
      uint8_t *rgba = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint16_t raw = texel[col];
         const uint8_t intensity = (uint8_t)(raw >> 8);
         rgba[0] = intensity; /* r */
         rgba[1] = intensity; /* g */
         rgba[2] = intensity; /* b */
         rgba[3] = intensity; /* a */
         rgba += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(uint8_t);
   }
}

/**
 * Pack rows of 4-byte RGBA8 UNORM pixels into 16-bit I16_UNORM texels.
 *
 * Only the red byte (index 0) of each pixel is read; it is widened from
 * 0..0xff to 0..0xffff and stored as one native 16-bit word.
 * Both strides are byte counts.
 */
static inline void
util_format_i16_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *pixel = src_row;
      uint16_t *texel = (uint16_t *)dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint32_t red = pixel[0];
         texel[col] = (uint16_t)(red * 0xffff / 0xff);
         pixel += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(uint8_t);
   }
}

/**
 * Unpack rows of 32-bit L16A16_UNORM texels into RGBA float pixels.
 *
 * Each texel is read as one native 32-bit word holding two 16-bit channels;
 * which half carries which channel is selected at compile time by
 * UTIL_ARCH_BIG_ENDIAN.  The first channel is scaled by 1/0xffff and
 * replicated to r, g and b; the second is scaled the same way into a.
 * Both strides are byte counts.
 */
static inline void
util_format_l16a16_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint32_t *texel = (const uint32_t *)src_row;
      float *rgba = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint32_t word = texel[col];
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t lum = word >> 16;
         const uint32_t alpha = word & 0xffff;
#else
         const uint32_t lum = word & 0xffff;
         const uint32_t alpha = word >> 16;
#endif
         const float lum_f = (float)(lum * (1.0f/0xffff));
         rgba[0] = lum_f; /* r */
         rgba[1] = lum_f; /* g */
         rgba[2] = lum_f; /* b */
         rgba[3] = (float)(alpha * (1.0f/0xffff)); /* a */
         rgba += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(float);
   }
}

/**
 * Pack rows of RGBA float pixels into 32-bit L16A16_UNORM texels.
 *
 * Red (index 0) and alpha (index 3) are each clamped to [0, 1], scaled by
 * 0xffff and rounded with util_iround().  The two 16-bit results are merged
 * into one native 32-bit word; which half receives which channel is
 * selected at compile time by UTIL_ARCH_BIG_ENDIAN.
 * Both strides are byte counts.
 */
static inline void
util_format_l16a16_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *pixel = src_row;
      uint32_t *texel = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint32_t lum = (uint16_t)util_iround(CLAMP(pixel[0], 0.0f, 1.0f) * 0xffff);
         const uint32_t alpha = (uint16_t)util_iround(CLAMP(pixel[3], 0.0f, 1.0f) * 0xffff);
#if UTIL_ARCH_BIG_ENDIAN
         texel[col] = (lum << 16) | (alpha & 0xffff);
#else
         texel[col] = (lum & 0xffff) | (alpha << 16);
#endif
         pixel += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Fetch a single L16A16_UNORM texel as an RGBA float pixel.
 *
 * src points at the 32-bit texel; the i and j arguments are not used.
 * Channel placement inside the word is selected at compile time by
 * UTIL_ARCH_BIG_ENDIAN.  The first channel goes to r, g and b, the second
 * to a, both scaled by 1/0xffff.
 */
static inline void
util_format_l16a16_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint32_t word = *(const uint32_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
   const uint32_t lum = word >> 16;
   const uint32_t alpha = word & 0xffff;
#else
   const uint32_t lum = word & 0xffff;
   const uint32_t alpha = word >> 16;
#endif
   const float lum_f = (float)(lum * (1.0f/0xffff));

   dst[0] = lum_f; /* r */
   dst[1] = lum_f; /* g */
   dst[2] = lum_f; /* b */
   dst[3] = (float)(alpha * (1.0f/0xffff)); /* a */
}

/**
 * Unpack rows of 32-bit L16A16_UNORM texels into 4-byte RGBA8 UNORM pixels.
 *
 * Each texel is read as one native 32-bit word; channel placement inside
 * the word is selected at compile time by UTIL_ARCH_BIG_ENDIAN.  The top
 * eight bits of the first channel go to r, g and b, and the top eight bits
 * of the second channel go to a.  Both strides are byte counts.
 */
static inline void
util_format_l16a16_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint32_t *texel = (const uint32_t *)src_row;
      uint8_t *rgba = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint32_t word = texel[col];
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t lum = word >> 16;
         const uint32_t alpha = word & 0xffff;
#else
         const uint32_t lum = word & 0xffff;
         const uint32_t alpha = word >> 16;
#endif
         const uint8_t lum8 = (uint8_t)(lum >> 8);
         rgba[0] = lum8; /* r */
         rgba[1] = lum8; /* g */
         rgba[2] = lum8; /* b */
         rgba[3] = (uint8_t)(alpha >> 8); /* a */
         rgba += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(uint8_t);
   }
}

/**
 * Pack rows of 4-byte RGBA8 UNORM pixels into 32-bit L16A16_UNORM texels.
 *
 * The red byte (index 0) and alpha byte (index 3) are each widened from
 * 0..0xff to 0..0xffff and merged into one native 32-bit word; which half
 * receives which channel is selected at compile time by
 * UTIL_ARCH_BIG_ENDIAN.  Both strides are byte counts.
 */
static inline void
util_format_l16a16_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *pixel = src_row;
      uint32_t *texel = (uint32_t *)dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint32_t lum = (uint16_t)(((uint32_t)pixel[0]) * 0xffff / 0xff);
         const uint32_t alpha = (uint16_t)(((uint32_t)pixel[3]) * 0xffff / 0xff);
#if UTIL_ARCH_BIG_ENDIAN
         texel[col] = (lum << 16) | (alpha & 0xffff);
#else
         texel[col] = (lum & 0xffff) | (alpha << 16);
#endif
         pixel += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(uint8_t);
   }
}

/**
 * Unpack rows of 8-bit A8_SNORM texels into RGBA float pixels.
 *
 * Each byte is reinterpreted as a signed value and scaled by 1/0x7f into
 * the alpha output; red, green and blue are written as 0.  The result is
 * saturated at -1.0 so that the most negative code (-128) does not decode
 * to a value below the normalized range.  Both strides are byte counts.
 */
static inline void
util_format_a8_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *texel = src_row;
      float *rgba = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const int8_t alpha = (int8_t)texel[col];
         float alpha_f = (float)(alpha * (1.0f/0x7f));

         /* -128 * (1/127) is about -1.008; clamp to the SNORM lower bound. */
         if (alpha_f < -1.0f)
            alpha_f = -1.0f;

         rgba[0] = 0; /* r */
         rgba[1] = 0; /* g */
         rgba[2] = 0; /* b */
         rgba[3] = alpha_f; /* a */
         rgba += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(float);
   }
}

/**
 * Pack rows of RGBA float pixels into 8-bit A8_SNORM texels.
 *
 * Only the alpha component (index 3) is used: it is clamped to [-1, 1],
 * scaled by 0x7f, rounded with util_iround() and stored as one signed byte.
 * Both strides are byte counts.
 */
static inline void
util_format_a8_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *pixel = src_row;
      uint8_t *texel = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         texel[col] = (uint8_t)((int8_t)util_iround(CLAMP(pixel[3], -1.0f, 1.0f) * 0x7f));
         pixel += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Fetch a single A8_SNORM texel as an RGBA float pixel.
 *
 * src points at the texel byte; the i and j arguments are not used.
 * Alpha is the signed byte scaled by 1/0x7f and saturated at -1.0 (so the
 * code -128 does not decode below the normalized range); the colour
 * channels are 0.
 */
static inline void
util_format_a8_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const int8_t alpha = (int8_t)*src;
   float alpha_f = (float)(alpha * (1.0f/0x7f));

   /* -128 * (1/127) is about -1.008; clamp to the SNORM lower bound. */
   if (alpha_f < -1.0f)
      alpha_f = -1.0f;

   dst[0] = 0; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = alpha_f; /* a */
}

/**
 * Unpack rows of 8-bit A8_SNORM texels into 4-byte RGBA8 UNORM pixels.
 *
 * Each byte is reinterpreted as signed; negative values are raised to 0
 * and the remaining 0..0x7f range is rescaled to 0..0xff for the alpha
 * byte.  Red, green and blue are written as 0.  Both strides are byte
 * counts.
 */
static inline void
util_format_a8_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *texel = src_row;
      uint8_t *rgba = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const int8_t alpha = (int8_t)texel[col];
         rgba[0] = 0; /* r */
         rgba[1] = 0; /* g */
         rgba[2] = 0; /* b */
         rgba[3] = (uint8_t)(((uint32_t)MAX2(alpha, 0)) * 0xff / 0x7f); /* a */
         rgba += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(uint8_t);
   }
}

/**
 * Pack rows of 4-byte RGBA8 UNORM pixels into 8-bit A8_SNORM texels.
 *
 * Only the alpha byte (index 3) is read; it is shifted right by one bit so
 * that 0..0xff maps onto the non-negative signed range 0..0x7f.
 * Both strides are byte counts.
 */
static inline void
util_format_a8_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *pixel = src_row;
      uint8_t *texel = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const int8_t alpha = (int8_t)(pixel[3] >> 1);
         texel[col] = (uint8_t)alpha;
         pixel += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(uint8_t);
   }
}

/**
 * Unpack rows of 8-bit L8_SNORM texels into RGBA float pixels.
 *
 * Each byte is reinterpreted as signed, scaled by 1/0x7f and replicated to
 * r, g and b; alpha is written as 1.  The scaled value is saturated at
 * -1.0 so that the most negative code (-128) does not decode to a value
 * below the normalized range.  Both strides are byte counts.
 */
static inline void
util_format_l8_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *texel = src_row;
      float *rgba = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const int8_t lum = (int8_t)texel[col];
         float lum_f = (float)(lum * (1.0f/0x7f));

         /* -128 * (1/127) is about -1.008; clamp to the SNORM lower bound. */
         if (lum_f < -1.0f)
            lum_f = -1.0f;

         rgba[0] = lum_f; /* r */
         rgba[1] = lum_f; /* g */
         rgba[2] = lum_f; /* b */
         rgba[3] = 1; /* a */
         rgba += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(float);
   }
}

/**
 * Pack rows of RGBA float pixels into 8-bit L8_SNORM texels.
 *
 * Only the red component (index 0) is used: it is clamped to [-1, 1],
 * scaled by 0x7f, rounded with util_iround() and stored as one signed byte.
 * Both strides are byte counts.
 */
static inline void
util_format_l8_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *pixel = src_row;
      uint8_t *texel = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         texel[col] = (uint8_t)((int8_t)util_iround(CLAMP(pixel[0], -1.0f, 1.0f) * 0x7f));
         pixel += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Fetch a single L8_SNORM texel as an RGBA float pixel.
 *
 * src points at the texel byte; the i and j arguments are not used.
 * The signed byte is scaled by 1/0x7f, saturated at -1.0 (so the code -128
 * does not decode below the normalized range) and written to r, g and b;
 * alpha is 1.
 */
static inline void
util_format_l8_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const int8_t lum = (int8_t)*src;
   float lum_f = (float)(lum * (1.0f/0x7f));

   /* -128 * (1/127) is about -1.008; clamp to the SNORM lower bound. */
   if (lum_f < -1.0f)
      lum_f = -1.0f;

   dst[0] = lum_f; /* r */
   dst[1] = lum_f; /* g */
   dst[2] = lum_f; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack rows of 8-bit L8_SNORM texels into 4-byte RGBA8 UNORM pixels.
 *
 * Each byte is reinterpreted as signed; negative values are raised to 0
 * and the remaining 0..0x7f range is rescaled to 0..0xff, then replicated
 * to r, g and b.  Alpha is written as 255.  Both strides are byte counts.
 */
static inline void
util_format_l8_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *texel = src_row;
      uint8_t *rgba = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const int8_t lum = (int8_t)texel[col];
         const uint8_t lum8 = (uint8_t)(((uint32_t)MAX2(lum, 0)) * 0xff / 0x7f);
         rgba[0] = lum8; /* r */
         rgba[1] = lum8; /* g */
         rgba[2] = lum8; /* b */
         rgba[3] = 255; /* a */
         rgba += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(uint8_t);
   }
}

/**
 * Pack rows of 4-byte RGBA8 UNORM pixels into 8-bit L8_SNORM texels.
 *
 * Only the red byte (index 0) is read; it is shifted right by one bit so
 * that 0..0xff maps onto the non-negative signed range 0..0x7f.
 * Both strides are byte counts.
 */
static inline void
util_format_l8_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *pixel = src_row;
      uint8_t *texel = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const int8_t lum = (int8_t)(pixel[0] >> 1);
         texel[col] = (uint8_t)lum;
         pixel += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(uint8_t);
   }
}

/**
 * Unpack rows of 16-bit L8A8_SNORM texels into RGBA float pixels.
 *
 * Each texel is read as one native 16-bit word holding two signed 8-bit
 * channels; which byte carries which channel is selected at compile time
 * by UTIL_ARCH_BIG_ENDIAN.  The first channel is scaled by 1/0x7f and
 * replicated to r, g and b; the second is scaled the same way into a.
 * Both results are saturated at -1.0 so that the most negative code (-128)
 * does not decode to a value below the normalized range.
 * Both strides are byte counts.
 */
static inline void
util_format_l8a8_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint16_t *texel = (const uint16_t *)src_row;
      float *rgba = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint16_t value = texel[col];
         /* Shift the wanted byte to the top, then shift back down to
          * sign-extend it.
          */
#if UTIL_ARCH_BIG_ENDIAN
         const int16_t lum = ((int16_t)(value) ) >> 8;
         const int16_t alpha = ((int16_t)(value << 8) ) >> 8;
#else
         const int16_t lum = ((int16_t)(value << 8) ) >> 8;
         const int16_t alpha = ((int16_t)(value) ) >> 8;
#endif
         float lum_f = (float)(lum * (1.0f/0x7f));
         float alpha_f = (float)(alpha * (1.0f/0x7f));

         /* -128 * (1/127) is about -1.008; clamp to the SNORM lower bound. */
         if (lum_f < -1.0f)
            lum_f = -1.0f;
         if (alpha_f < -1.0f)
            alpha_f = -1.0f;

         rgba[0] = lum_f; /* r */
         rgba[1] = lum_f; /* g */
         rgba[2] = lum_f; /* b */
         rgba[3] = alpha_f; /* a */
         rgba += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(float);
   }
}

/**
 * Pack rows of RGBA float pixels into 16-bit L8A8_SNORM texels.
 *
 * Red (index 0) and alpha (index 3) are each clamped to [-1, 1], scaled by
 * 0x7f and rounded with util_iround() to a signed byte.  The two bytes are
 * merged into one native 16-bit word; which byte receives which channel is
 * selected at compile time by UTIL_ARCH_BIG_ENDIAN.
 * Both strides are byte counts.
 */
static inline void
util_format_l8a8_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *pixel = src_row;
      uint16_t *texel = (uint16_t *)dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const int8_t lum = (int8_t)util_iround(CLAMP(pixel[0], -1.0f, 1.0f) * 0x7f);
         const int8_t alpha = (int8_t)util_iround(CLAMP(pixel[3], -1.0f, 1.0f) * 0x7f);
#if UTIL_ARCH_BIG_ENDIAN
         texel[col] = (uint16_t)((uint16_t)((uint32_t)lum << 8) | (uint16_t)(alpha & 0xff));
#else
         texel[col] = (uint16_t)((uint16_t)(lum & 0xff) | (uint16_t)((uint32_t)alpha << 8));
#endif
         pixel += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Fetch a single L8A8_SNORM texel as an RGBA float pixel.
 *
 * src points at the 16-bit texel; the i and j arguments are not used.
 * Channel placement inside the word is selected at compile time by
 * UTIL_ARCH_BIG_ENDIAN.  The first channel goes to r, g and b, the second
 * to a; both are scaled by 1/0x7f and saturated at -1.0 so the code -128
 * does not decode below the normalized range.
 */
static inline void
util_format_l8a8_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint16_t value = *(const uint16_t *)src;
   /* Shift the wanted byte to the top, then shift back down to
    * sign-extend it.
    */
#if UTIL_ARCH_BIG_ENDIAN
   const int16_t lum = ((int16_t)(value) ) >> 8;
   const int16_t alpha = ((int16_t)(value << 8) ) >> 8;
#else
   const int16_t lum = ((int16_t)(value << 8) ) >> 8;
   const int16_t alpha = ((int16_t)(value) ) >> 8;
#endif
   float lum_f = (float)(lum * (1.0f/0x7f));
   float alpha_f = (float)(alpha * (1.0f/0x7f));

   /* -128 * (1/127) is about -1.008; clamp to the SNORM lower bound. */
   if (lum_f < -1.0f)
      lum_f = -1.0f;
   if (alpha_f < -1.0f)
      alpha_f = -1.0f;

   dst[0] = lum_f; /* r */
   dst[1] = lum_f; /* g */
   dst[2] = lum_f; /* b */
   dst[3] = alpha_f; /* a */
}

/**
 * Unpack rows of 16-bit L8A8_SNORM texels into 4-byte RGBA8 UNORM pixels.
 *
 * Each texel is read as one native 16-bit word holding two signed 8-bit
 * channels; channel placement is selected at compile time by
 * UTIL_ARCH_BIG_ENDIAN.  Negative values are raised to 0 and the remaining
 * 0..0x7f range is rescaled to 0..0xff.  The first channel goes to r, g
 * and b, the second to a.  Both strides are byte counts.
 */
static inline void
util_format_l8a8_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint16_t *texel = (const uint16_t *)src_row;
      uint8_t *rgba = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint16_t value = texel[col];
#if UTIL_ARCH_BIG_ENDIAN
         const int16_t lum = ((int16_t)(value) ) >> 8;
         const int16_t alpha = ((int16_t)(value << 8) ) >> 8;
#else
         const int16_t lum = ((int16_t)(value << 8) ) >> 8;
         const int16_t alpha = ((int16_t)(value) ) >> 8;
#endif
         const uint8_t lum8 = (uint8_t)(((uint32_t)MAX2(lum, 0)) * 0xff / 0x7f);
         rgba[0] = lum8; /* r */
         rgba[1] = lum8; /* g */
         rgba[2] = lum8; /* b */
         rgba[3] = (uint8_t)(((uint32_t)MAX2(alpha, 0)) * 0xff / 0x7f); /* a */
         rgba += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(uint8_t);
   }
}

/**
 * Pack rows of 4-byte RGBA8 UNORM pixels into 16-bit L8A8_SNORM texels.
 *
 * The red byte (index 0) and alpha byte (index 3) are each shifted right
 * by one bit (0..0xff becomes 0..0x7f) and merged into one native 16-bit
 * word; which byte receives which channel is selected at compile time by
 * UTIL_ARCH_BIG_ENDIAN.  Both strides are byte counts.
 */
static inline void
util_format_l8a8_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *pixel = src_row;
      uint16_t *texel = (uint16_t *)dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const int8_t lum = (int8_t)(pixel[0] >> 1);
         const int8_t alpha = (int8_t)(pixel[3] >> 1);
#if UTIL_ARCH_BIG_ENDIAN
         texel[col] = (uint16_t)((uint16_t)((uint32_t)lum << 8) | (uint16_t)(alpha & 0xff));
#else
         texel[col] = (uint16_t)((uint16_t)(lum & 0xff) | (uint16_t)((uint32_t)alpha << 8));
#endif
         pixel += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(uint8_t);
   }
}

/**
 * Unpack rows of 8-bit I8_SNORM texels into RGBA float pixels.
 *
 * Each byte is reinterpreted as signed, scaled by 1/0x7f and replicated to
 * all four outputs.  The scaled value is saturated at -1.0 so that the
 * most negative code (-128) does not decode to a value below the
 * normalized range.  Both strides are byte counts.
 */
static inline void
util_format_i8_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *texel = src_row;
      float *rgba = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const int8_t raw = (int8_t)texel[col];
         float intensity = (float)(raw * (1.0f/0x7f));

         /* -128 * (1/127) is about -1.008; clamp to the SNORM lower bound. */
         if (intensity < -1.0f)
            intensity = -1.0f;

         rgba[0] = intensity; /* r */
         rgba[1] = intensity; /* g */
         rgba[2] = intensity; /* b */
         rgba[3] = intensity; /* a */
         rgba += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(float);
   }
}

/**
 * Pack rows of RGBA float pixels into 8-bit I8_SNORM texels.
 *
 * Only the red component (index 0) is used: it is clamped to [-1, 1],
 * scaled by 0x7f, rounded with util_iround() and stored as one signed byte.
 * Both strides are byte counts.
 */
static inline void
util_format_i8_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *pixel = src_row;
      uint8_t *texel = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         texel[col] = (uint8_t)((int8_t)util_iround(CLAMP(pixel[0], -1.0f, 1.0f) * 0x7f));
         pixel += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Fetch a single I8_SNORM texel as an RGBA float pixel.
 *
 * src points at the texel byte; the i and j arguments are not used.
 * The signed byte is scaled by 1/0x7f, saturated at -1.0 (so the code -128
 * does not decode below the normalized range) and written to all four
 * outputs.
 */
static inline void
util_format_i8_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const int8_t raw = (int8_t)*src;
   float intensity = (float)(raw * (1.0f/0x7f));

   /* -128 * (1/127) is about -1.008; clamp to the SNORM lower bound. */
   if (intensity < -1.0f)
      intensity = -1.0f;

   dst[0] = intensity; /* r */
   dst[1] = intensity; /* g */
   dst[2] = intensity; /* b */
   dst[3] = intensity; /* a */
}

/**
 * Unpack rows of 8-bit I8_SNORM texels into 4-byte RGBA8 UNORM pixels.
 *
 * Each byte is reinterpreted as signed; negative values are raised to 0
 * and the remaining 0..0x7f range is rescaled to 0..0xff, then replicated
 * to r, g, b and a.  Both strides are byte counts.
 */
static inline void
util_format_i8_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *texel = src_row;
      uint8_t *rgba = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const int8_t raw = (int8_t)texel[col];
         const uint8_t intensity = (uint8_t)(((uint32_t)MAX2(raw, 0)) * 0xff / 0x7f);
         rgba[0] = intensity; /* r */
         rgba[1] = intensity; /* g */
         rgba[2] = intensity; /* b */
         rgba[3] = intensity; /* a */
         rgba += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(uint8_t);
   }
}

/**
 * Pack rows of 4-byte RGBA8 UNORM pixels into 8-bit I8_SNORM texels.
 *
 * Only the red byte (index 0) is read; it is shifted right by one bit so
 * that 0..0xff maps onto the non-negative signed range 0..0x7f.
 * Both strides are byte counts.
 */
static inline void
util_format_i8_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *pixel = src_row;
      uint8_t *texel = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const int8_t intensity = (int8_t)(pixel[0] >> 1);
         texel[col] = (uint8_t)intensity;
         pixel += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(uint8_t);
   }
}

/**
 * Unpack rows of 16-bit A16_SNORM texels into RGBA float pixels.
 *
 * Each texel is read as one native 16-bit word, reinterpreted as signed and
 * scaled by 1/0x7fff into the alpha output; red, green and blue are written
 * as 0.  The result is saturated at -1.0 so that the most negative code
 * (-32768) does not decode to a value below the normalized range.
 * Both strides are byte counts.
 */
static inline void
util_format_a16_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint16_t *texel = (const uint16_t *)src_row;
      float *rgba = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const int16_t alpha = (int16_t)texel[col];
         float alpha_f = (float)(alpha * (1.0f/0x7fff));

         /* -32768 * (1/32767) is slightly below -1; clamp to the SNORM
          * lower bound.
          */
         if (alpha_f < -1.0f)
            alpha_f = -1.0f;

         rgba[0] = 0; /* r */
         rgba[1] = 0; /* g */
         rgba[2] = 0; /* b */
         rgba[3] = alpha_f; /* a */
         rgba += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(float);
   }
}

/**
 * Pack rows of RGBA float pixels into 16-bit A16_SNORM texels.
 *
 * Only the alpha component (index 3) is used: it is clamped to [-1, 1],
 * scaled by 0x7fff, rounded with util_iround() and stored as one native
 * signed 16-bit word.  Both strides are byte counts.
 */
static inline void
util_format_a16_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *pixel = src_row;
      uint16_t *texel = (uint16_t *)dst_row;

      for (unsigned col = 0; col < width; ++col) {
         texel[col] = (uint16_t)((int16_t)util_iround(CLAMP(pixel[3], -1.0f, 1.0f) * 0x7fff));
         pixel += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Fetch a single A16_SNORM texel as an RGBA float pixel.
 *
 * src points at the 16-bit texel; the i and j arguments are not used.
 * Alpha is the signed word scaled by 1/0x7fff and saturated at -1.0 (so the
 * code -32768 does not decode below the normalized range); the colour
 * channels are 0.
 */
static inline void
util_format_a16_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const int16_t alpha = (int16_t)*(const uint16_t *)src;
   float alpha_f = (float)(alpha * (1.0f/0x7fff));

   /* -32768 * (1/32767) is slightly below -1; clamp to the SNORM lower
    * bound.
    */
   if (alpha_f < -1.0f)
      alpha_f = -1.0f;

   dst[0] = 0; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = alpha_f; /* a */
}

/**
 * Unpack rows of 16-bit A16_SNORM texels into 4-byte RGBA8 UNORM pixels.
 *
 * Each texel is reinterpreted as signed; negative values are raised to 0
 * and the result is shifted right by 7 bits to form the alpha byte.
 * Red, green and blue are written as 0.  Both strides are byte counts.
 */
static inline void
util_format_a16_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint16_t *texel = (const uint16_t *)src_row;
      uint8_t *rgba = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const int16_t alpha = (int16_t)texel[col];
         rgba[0] = 0; /* r */
         rgba[1] = 0; /* g */
         rgba[2] = 0; /* b */
         rgba[3] = (uint8_t)(MAX2(alpha, 0) >> 7); /* a */
         rgba += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(uint8_t);
   }
}

/**
 * Pack rows of 4-byte RGBA8 UNORM pixels into 16-bit A16_SNORM texels.
 *
 * Only the alpha byte (index 3) is read; it is rescaled from 0..0xff to
 * 0..0x7fff and stored as one native signed 16-bit word.
 * Both strides are byte counts.
 */
static inline void
util_format_a16_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *pixel = src_row;
      uint16_t *texel = (uint16_t *)dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint32_t alpha = pixel[3];
         texel[col] = (uint16_t)((int16_t)(alpha * 0x7fff / 0xff));
         pixel += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(uint8_t);
   }
}

/**
 * Unpack rows of 16-bit L16_SNORM texels into RGBA float pixels.
 *
 * Each texel is read as one native 16-bit word, reinterpreted as signed,
 * scaled by 1/0x7fff and replicated to r, g and b; alpha is written as 1.
 * The scaled value is saturated at -1.0 so that the most negative code
 * (-32768) does not decode to a value below the normalized range.
 * Both strides are byte counts.
 */
static inline void
util_format_l16_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint16_t *texel = (const uint16_t *)src_row;
      float *rgba = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const int16_t lum = (int16_t)texel[col];
         float lum_f = (float)(lum * (1.0f/0x7fff));

         /* -32768 * (1/32767) is slightly below -1; clamp to the SNORM
          * lower bound.
          */
         if (lum_f < -1.0f)
            lum_f = -1.0f;

         rgba[0] = lum_f; /* r */
         rgba[1] = lum_f; /* g */
         rgba[2] = lum_f; /* b */
         rgba[3] = 1; /* a */
         rgba += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(float);
   }
}

/**
 * Pack rows of RGBA float pixels into 16-bit L16_SNORM texels.
 *
 * Only the red component (index 0) is used: it is clamped to [-1, 1],
 * scaled by 0x7fff, rounded with util_iround() and stored as one native
 * signed 16-bit word.  Both strides are byte counts.
 */
static inline void
util_format_l16_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *pixel = src_row;
      uint16_t *texel = (uint16_t *)dst_row;

      for (unsigned col = 0; col < width; ++col) {
         texel[col] = (uint16_t)((int16_t)util_iround(CLAMP(pixel[0], -1.0f, 1.0f) * 0x7fff));
         pixel += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(float);
   }
}

/**
 * Fetch a single L16_SNORM texel as an RGBA float pixel.
 *
 * src points at the 16-bit texel; the i and j arguments are not used.
 * The signed word is scaled by 1/0x7fff, saturated at -1.0 (so the code
 * -32768 does not decode below the normalized range) and written to r, g
 * and b; alpha is 1.
 */
static inline void
util_format_l16_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const int16_t lum = (int16_t)*(const uint16_t *)src;
   float lum_f = (float)(lum * (1.0f/0x7fff));

   /* -32768 * (1/32767) is slightly below -1; clamp to the SNORM lower
    * bound.
    */
   if (lum_f < -1.0f)
      lum_f = -1.0f;

   dst[0] = lum_f; /* r */
   dst[1] = lum_f; /* g */
   dst[2] = lum_f; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack rows of 16-bit L16_SNORM texels into 4-byte RGBA8 UNORM pixels.
 *
 * Each texel is reinterpreted as signed; negative values are raised to 0
 * and the result is shifted right by 7 bits, then replicated to r, g and
 * b.  Alpha is written as 255.  Both strides are byte counts.
 */
static inline void
util_format_l16_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint16_t *texel = (const uint16_t *)src_row;
      uint8_t *rgba = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const int16_t lum = (int16_t)texel[col];
         const uint8_t lum8 = (uint8_t)(MAX2(lum, 0) >> 7);
         rgba[0] = lum8; /* r */
         rgba[1] = lum8; /* g */
         rgba[2] = lum8; /* b */
         rgba[3] = 255; /* a */
         rgba += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(uint8_t);
   }
}

/**
 * Pack rows of 4-byte RGBA8 UNORM pixels into 16-bit L16_SNORM texels.
 *
 * Only the red byte (index 0) is read; it is rescaled from 0..0xff to
 * 0..0x7fff and stored as one native signed 16-bit word.
 * Both strides are byte counts.
 */
static inline void
util_format_l16_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *pixel = src_row;
      uint16_t *texel = (uint16_t *)dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint32_t red = pixel[0];
         texel[col] = (uint16_t)((int16_t)(red * 0x7fff / 0xff));
         pixel += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(uint8_t);
   }
}

/**
 * Unpack rows of 32-bit L16A16_SNORM texels into RGBA float pixels.
 *
 * Each texel is read as one native 32-bit word holding two signed 16-bit
 * channels; which half carries which channel is selected at compile time
 * by UTIL_ARCH_BIG_ENDIAN.  The first channel is scaled by 1/0x7fff and
 * replicated to r, g and b; the second is scaled the same way into a.
 * Both results are saturated at -1.0 so that the most negative code
 * (-32768) does not decode to a value below the normalized range.
 * Both strides are byte counts.
 */
static inline void
util_format_l16a16_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint32_t *texel = (const uint32_t *)src_row;
      float *rgba = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint32_t value = texel[col];
         /* Shift the wanted half to the top, then shift back down to
          * sign-extend it.
          */
#if UTIL_ARCH_BIG_ENDIAN
         const int32_t lum = ((int32_t)(value) ) >> 16;
         const int32_t alpha = ((int32_t)(value << 16) ) >> 16;
#else
         const int32_t lum = ((int32_t)(value << 16) ) >> 16;
         const int32_t alpha = ((int32_t)(value) ) >> 16;
#endif
         float lum_f = (float)(lum * (1.0f/0x7fff));
         float alpha_f = (float)(alpha * (1.0f/0x7fff));

         /* -32768 * (1/32767) is slightly below -1; clamp to the SNORM
          * lower bound.
          */
         if (lum_f < -1.0f)
            lum_f = -1.0f;
         if (alpha_f < -1.0f)
            alpha_f = -1.0f;

         rgba[0] = lum_f; /* r */
         rgba[1] = lum_f; /* g */
         rgba[2] = lum_f; /* b */
         rgba[3] = alpha_f; /* a */
         rgba += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(float);
   }
}

/**
 * Pack RGBA float pixels into l16a16_snorm.
 *
 * Reads the red (src[0]) and alpha (src[3]) floats of each pixel, clamps
 * them to [-1, 1], scales by 0x7fff and rounds with util_iround().  The two
 * 16-bit results are combined into one 32-bit word whose half assignment is
 * selected by UTIL_ARCH_BIG_ENDIAN.  dst_stride and src_stride are byte
 * strides.
 */
static inline void
util_format_l16a16_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col) {
         const int16_t lum = (int16_t)util_iround(CLAMP(in[0], -1.0f, 1.0f) * 0x7fff);
         const int16_t alpha = (int16_t)util_iround(CLAMP(in[3], -1.0f, 1.0f) * 0x7fff);
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint32_t)lum << 16);
         value |= (uint32_t)(alpha & 0xffff);
#else
         value |= (uint32_t)(lum & 0xffff);
         value |= (uint32_t)((uint32_t)alpha << 16);
#endif
         /* memcpy rather than a uint32_t* cast: the byte pointer carries no
          * alignment guarantee. */
         memcpy(out, &value, sizeof value);
         in += 4;
         out += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single l16a16_snorm pixel as RGBA floats.
 *
 * src points at the 4-byte pixel; i and j are unused.  The two signed
 * 16-bit channels are extracted according to UTIL_ARCH_BIG_ENDIAN, scaled
 * by 1/0x7fff and clamped to -1.0 so that -0x8000 does not decode below the
 * normalized range.  Luminance is written to r, g and b.
 */
static inline void
util_format_l16a16_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const float scale = 1.0f/0x7fff;
   uint32_t value;
   int32_t lum_bits;
   int32_t alpha_bits;
   float lum;
   float alpha;

   /* memcpy rather than a uint32_t* cast: the byte pointer carries no
    * alignment guarantee. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   lum_bits = ((int32_t)(value) ) >> 16;
   alpha_bits = ((int32_t)(value << 16) ) >> 16;
#else
   lum_bits = ((int32_t)(value << 16) ) >> 16;
   alpha_bits = ((int32_t)(value) ) >> 16;
#endif
   lum = (float)(lum_bits * scale);
   if (lum < -1.0f) {
      lum = -1.0f;
   }
   alpha = (float)(alpha_bits * scale);
   if (alpha < -1.0f) {
      alpha = -1.0f;
   }
   dst[0] = lum; /* r */
   dst[1] = lum; /* g */
   dst[2] = lum; /* b */
   dst[3] = alpha; /* a */
}

/**
 * Unpack l16a16_snorm pixels into RGBA8 unorm.
 *
 * The two signed 16-bit channels of each 32-bit pixel are extracted
 * according to UTIL_ARCH_BIG_ENDIAN.  Negative values become 0 and the
 * remaining 15 magnitude bits are reduced to 8 with a right shift by 7.
 * Luminance is written to r, g and b.  src_stride and dst_stride are byte
 * strides.
 */
static inline void
util_format_l16a16_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col) {
         uint32_t value;
         int32_t lum_bits;
         int32_t alpha_bits;
         uint8_t lum;
         uint8_t alpha;

         /* memcpy rather than a uint32_t* cast: the byte pointer carries no
          * alignment guarantee. */
         memcpy(&value, in, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         lum_bits = ((int32_t)(value) ) >> 16;
         alpha_bits = ((int32_t)(value << 16) ) >> 16;
#else
         lum_bits = ((int32_t)(value << 16) ) >> 16;
         alpha_bits = ((int32_t)(value) ) >> 16;
#endif
         /* Clamp negatives to zero before dropping the low 7 bits. */
         lum = (uint8_t)((lum_bits > 0 ? lum_bits : 0) >> 7);
         alpha = (uint8_t)((alpha_bits > 0 ? alpha_bits : 0) >> 7);
         out[0] = lum; /* r */
         out[1] = lum; /* g */
         out[2] = lum; /* b */
         out[3] = alpha; /* a */
         in += 4;
         out += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA8 unorm pixels into l16a16_snorm.
 *
 * The red (src[0]) and alpha (src[3]) bytes of each pixel are rescaled from
 * [0, 0xff] to [0, 0x7fff] and combined into one 32-bit word whose half
 * assignment is selected by UTIL_ARCH_BIG_ENDIAN.  dst_stride and
 * src_stride are byte strides.
 */
static inline void
util_format_l16a16_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col) {
         const int16_t lum = (int16_t)(((uint32_t)in[0]) * 0x7fff / 0xff);
         const int16_t alpha = (int16_t)(((uint32_t)in[3]) * 0x7fff / 0xff);
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint32_t)lum << 16);
         value |= (uint32_t)(alpha & 0xffff);
#else
         value |= (uint32_t)(lum & 0xffff);
         value |= (uint32_t)((uint32_t)alpha << 16);
#endif
         /* memcpy rather than a uint32_t* cast: the byte pointer carries no
          * alignment guarantee. */
         memcpy(out, &value, sizeof value);
         in += 4;
         out += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack i16_snorm pixels into RGBA floats.
 *
 * Each 2-byte source pixel is one signed 16-bit intensity value.  It is
 * scaled by 1/0x7fff, clamped to -1.0 so that -0x8000 does not decode below
 * the normalized range, and written to all four destination channels.
 * src_stride and dst_stride are byte strides.
 */
static inline void
util_format_i16_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const float scale = 1.0f/0x7fff;

   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      for (unsigned col = 0; col < width; ++col) {
         uint16_t value;
         int16_t bits;
         float intensity;

         /* memcpy rather than a uint16_t* cast: the byte pointer carries no
          * alignment guarantee. */
         memcpy(&value, in, sizeof value);
         bits = (int16_t)(value);
         intensity = (float)(bits * scale);
         if (intensity < -1.0f) {
            intensity = -1.0f;
         }
         out[0] = intensity; /* r */
         out[1] = intensity; /* g */
         out[2] = intensity; /* b */
         out[3] = intensity; /* a */
         in += 2;
         out += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA float pixels into i16_snorm.
 *
 * Only the red float (src[0]) of each pixel is read.  It is clamped to
 * [-1, 1], scaled by 0x7fff, rounded with util_iround() and stored as one
 * native-endian 16-bit word.  dst_stride and src_stride are byte strides.
 */
static inline void
util_format_i16_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col) {
         const uint16_t value = (uint16_t)((int16_t)util_iround(CLAMP(in[0], -1.0f, 1.0f) * 0x7fff));
         /* memcpy rather than a uint16_t* cast: the byte pointer carries no
          * alignment guarantee. */
         memcpy(out, &value, sizeof value);
         in += 4;
         out += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single i16_snorm pixel as RGBA floats.
 *
 * src points at the 2-byte pixel; i and j are unused.  The signed 16-bit
 * value is scaled by 1/0x7fff, clamped to -1.0 so that -0x8000 does not
 * decode below the normalized range, and written to all four channels.
 */
static inline void
util_format_i16_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint16_t value;
   int16_t bits;
   float intensity;

   /* memcpy rather than a uint16_t* cast: the byte pointer carries no
    * alignment guarantee. */
   memcpy(&value, src, sizeof value);
   bits = (int16_t)(value);
   intensity = (float)(bits * (1.0f/0x7fff));
   if (intensity < -1.0f) {
      intensity = -1.0f;
   }
   dst[0] = intensity; /* r */
   dst[1] = intensity; /* g */
   dst[2] = intensity; /* b */
   dst[3] = intensity; /* a */
}

/**
 * Unpack i16_snorm pixels into RGBA8 unorm.
 *
 * Each 2-byte source pixel is one signed 16-bit intensity value.  Negative
 * values become 0, the remaining 15 magnitude bits are reduced to 8 with a
 * right shift by 7, and the result is written to all four channels.
 * src_stride and dst_stride are byte strides.
 */
static inline void
util_format_i16_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col) {
         uint16_t value;
         int16_t bits;
         uint8_t intensity;

         /* memcpy rather than a uint16_t* cast: the byte pointer carries no
          * alignment guarantee. */
         memcpy(&value, in, sizeof value);
         bits = (int16_t)(value);
         /* Clamp negatives to zero before dropping the low 7 bits. */
         intensity = (uint8_t)((bits > 0 ? bits : 0) >> 7);
         out[0] = intensity; /* r */
         out[1] = intensity; /* g */
         out[2] = intensity; /* b */
         out[3] = intensity; /* a */
         in += 2;
         out += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA8 unorm pixels into i16_snorm.
 *
 * Only the red byte of every 4-byte source pixel is read.  It is rescaled
 * from [0, 0xff] to [0, 0x7fff] and written as one native-endian 16-bit
 * word.  dst_stride and src_stride are byte strides.
 */
static inline void
util_format_i16_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      for (unsigned col = 0; col < width; ++col) {
         const uint16_t value = (uint16_t)((int16_t)(((uint32_t)in[0]) * 0x7fff / 0xff));
         /* memcpy rather than a uint16_t* cast: the byte pointer carries no
          * alignment guarantee. */
         memcpy(out, &value, sizeof value);
         in += 4;
         out += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Storage layout of one a16_float pixel as copied to and from memory by the
 * a16_float pack/unpack/fetch helpers.
 */
struct util_format_a16_float {
   uint16_t a; /* alpha; converted with util_half_to_float()/util_float_to_half() */
};

static inline void
util_format_a16_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = 0; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = util_half_to_float(pixel.a); /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_a16_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_a16_float pixel;
         pixel.a = util_float_to_half(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single a16_float pixel as RGBA floats.
 *
 * src points at the 2-byte pixel; i and j are unused.  Alpha is converted
 * with util_half_to_float(); r, g and b are written as 0.
 */
static inline void
util_format_a16_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_a16_float texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = 0; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = util_half_to_float(texel.a); /* a */
}

static inline void
util_format_a16_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = 0; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = float_to_ubyte(util_half_to_float(pixel.a)); /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_a16_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_a16_float pixel;
         pixel.a = util_float_to_half((float)(src[3] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Storage layout of one l16_float pixel as copied to and from memory by the
 * l16_float pack/unpack/fetch helpers.
 */
struct util_format_l16_float {
   uint16_t rgb; /* luminance; unpacked to r, g and b via util_half_to_float() */
};

static inline void
util_format_l16_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_l16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_half_to_float(pixel.rgb); /* r */
         dst[1] = util_half_to_float(pixel.rgb); /* g */
         dst[2] = util_half_to_float(pixel.rgb); /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_l16_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_l16_float pixel;
         pixel.rgb = util_float_to_half(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single l16_float pixel as RGBA floats.
 *
 * src points at the 2-byte pixel; i and j are unused.  The channel is
 * converted with util_half_to_float() and written to r, g and b; alpha is
 * written as 1.
 */
static inline void
util_format_l16_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_l16_float texel;
   float lum;

   memcpy(&texel, src, sizeof texel);
   lum = util_half_to_float(texel.rgb);
   dst[0] = lum; /* r */
   dst[1] = lum; /* g */
   dst[2] = lum; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_l16_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_l16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(util_half_to_float(pixel.rgb)); /* r */
         dst[1] = float_to_ubyte(util_half_to_float(pixel.rgb)); /* g */
         dst[2] = float_to_ubyte(util_half_to_float(pixel.rgb)); /* b */
         dst[3] = 255; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_l16_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_l16_float pixel;
         pixel.rgb = util_float_to_half((float)(src[0] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Storage layout of one l16a16_float pixel as copied to and from memory by
 * the l16a16_float pack/unpack/fetch helpers.  The member order is the same
 * for either value of UTIL_ARCH_BIG_ENDIAN, so it is declared once.
 */
struct util_format_l16a16_float {
   uint16_t rgb; /* luminance; unpacked to r, g and b */
   uint16_t a;   /* alpha */
};

static inline void
util_format_l16a16_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l16a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_half_to_float(pixel.rgb); /* r */
         dst[1] = util_half_to_float(pixel.rgb); /* g */
         dst[2] = util_half_to_float(pixel.rgb); /* b */
         dst[3] = util_half_to_float(pixel.a); /* a */
#else
         struct util_format_l16a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_half_to_float(pixel.rgb); /* r */
         dst[1] = util_half_to_float(pixel.rgb); /* g */
         dst[2] = util_half_to_float(pixel.rgb); /* b */
         dst[3] = util_half_to_float(pixel.a); /* a */
#endif
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_l16a16_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l16a16_float pixel;
         pixel.rgb = util_float_to_half(src[0]);
         pixel.a = util_float_to_half(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_l16a16_float pixel;
         pixel.rgb = util_float_to_half(src[0]);
         pixel.a = util_float_to_half(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single l16a16_float pixel as RGBA floats.
 *
 * src points at the 4-byte pixel; i and j are unused.  Both members are
 * converted with util_half_to_float(); luminance goes to r, g and b.  The
 * same code serves either UTIL_ARCH_BIG_ENDIAN setting.
 */
static inline void
util_format_l16a16_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_l16a16_float texel;
   float lum;

   memcpy(&texel, src, sizeof texel);
   lum = util_half_to_float(texel.rgb);
   dst[0] = lum; /* r */
   dst[1] = lum; /* g */
   dst[2] = lum; /* b */
   dst[3] = util_half_to_float(texel.a); /* a */
}

static inline void
util_format_l16a16_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l16a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(util_half_to_float(pixel.rgb)); /* r */
         dst[1] = float_to_ubyte(util_half_to_float(pixel.rgb)); /* g */
         dst[2] = float_to_ubyte(util_half_to_float(pixel.rgb)); /* b */
         dst[3] = float_to_ubyte(util_half_to_float(pixel.a)); /* a */
#else
         struct util_format_l16a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(util_half_to_float(pixel.rgb)); /* r */
         dst[1] = float_to_ubyte(util_half_to_float(pixel.rgb)); /* g */
         dst[2] = float_to_ubyte(util_half_to_float(pixel.rgb)); /* b */
         dst[3] = float_to_ubyte(util_half_to_float(pixel.a)); /* a */
#endif
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_l16a16_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l16a16_float pixel;
         pixel.rgb = util_float_to_half((float)(src[0] * (1.0f/0xff)));
         pixel.a = util_float_to_half((float)(src[3] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_l16a16_float pixel;
         pixel.rgb = util_float_to_half((float)(src[0] * (1.0f/0xff)));
         pixel.a = util_float_to_half((float)(src[3] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Storage layout of one i16_float pixel as copied to and from memory by the
 * i16_float pack/unpack/fetch helpers.
 */
struct util_format_i16_float {
   uint16_t rgba; /* intensity; unpacked to all four channels via util_half_to_float() */
};

static inline void
util_format_i16_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_i16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_half_to_float(pixel.rgba); /* r */
         dst[1] = util_half_to_float(pixel.rgba); /* g */
         dst[2] = util_half_to_float(pixel.rgba); /* b */
         dst[3] = util_half_to_float(pixel.rgba); /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_i16_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_i16_float pixel;
         pixel.rgba = util_float_to_half(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single i16_float pixel as RGBA floats.
 *
 * src points at the 2-byte pixel; i and j are unused.  The channel is
 * converted with util_half_to_float() and written to all four channels.
 */
static inline void
util_format_i16_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_i16_float texel;
   float intensity;

   memcpy(&texel, src, sizeof texel);
   intensity = util_half_to_float(texel.rgba);
   dst[0] = intensity; /* r */
   dst[1] = intensity; /* g */
   dst[2] = intensity; /* b */
   dst[3] = intensity; /* a */
}

static inline void
util_format_i16_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_i16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(util_half_to_float(pixel.rgba)); /* r */
         dst[1] = float_to_ubyte(util_half_to_float(pixel.rgba)); /* g */
         dst[2] = float_to_ubyte(util_half_to_float(pixel.rgba)); /* b */
         dst[3] = float_to_ubyte(util_half_to_float(pixel.rgba)); /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_i16_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_i16_float pixel;
         pixel.rgba = util_float_to_half((float)(src[0] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Storage layout of one a32_float pixel as copied to and from memory by the
 * a32_float pack/unpack/fetch helpers.
 */
struct util_format_a32_float {
   float a; /* alpha */
};

static inline void
util_format_a32_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = 0; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = pixel.a; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_a32_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_a32_float pixel;
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single a32_float pixel as RGBA floats.
 *
 * src points at the 4-byte pixel; i and j are unused.  Alpha is written
 * unchanged; r, g and b are written as 0.
 */
static inline void
util_format_a32_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_a32_float texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = 0; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = texel.a; /* a */
}

static inline void
util_format_a32_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = 0; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = float_to_ubyte(pixel.a); /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_a32_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_a32_float pixel;
         pixel.a = ubyte_to_float(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Storage layout of one l32_float pixel as copied to and from memory by the
 * l32_float pack/unpack/fetch helpers.
 */
struct util_format_l32_float {
   float rgb; /* luminance; unpacked to r, g and b */
};

static inline void
util_format_l32_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_l32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.rgb; /* r */
         dst[1] = pixel.rgb; /* g */
         dst[2] = pixel.rgb; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_l32_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_l32_float pixel;
         pixel.rgb = src[0];
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single l32_float pixel as RGBA floats.
 *
 * src points at the 4-byte pixel; i and j are unused.  The channel is
 * written unchanged to r, g and b; alpha is written as 1.
 */
static inline void
util_format_l32_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_l32_float texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = texel.rgb; /* r */
   dst[1] = texel.rgb; /* g */
   dst[2] = texel.rgb; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_l32_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_l32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.rgb); /* r */
         dst[1] = float_to_ubyte(pixel.rgb); /* g */
         dst[2] = float_to_ubyte(pixel.rgb); /* b */
         dst[3] = 255; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_l32_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_l32_float pixel;
         pixel.rgb = ubyte_to_float(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Storage layout of one l32a32_float pixel as copied to and from memory by
 * the l32a32_float pack/unpack/fetch helpers.  The member order is the same
 * for either value of UTIL_ARCH_BIG_ENDIAN, so it is declared once.
 */
struct util_format_l32a32_float {
   float rgb; /* luminance; unpacked to r, g and b */
   float a;   /* alpha */
};

static inline void
util_format_l32a32_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.rgb; /* r */
         dst[1] = pixel.rgb; /* g */
         dst[2] = pixel.rgb; /* b */
         dst[3] = pixel.a; /* a */
#else
         struct util_format_l32a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.rgb; /* r */
         dst[1] = pixel.rgb; /* g */
         dst[2] = pixel.rgb; /* b */
         dst[3] = pixel.a; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_l32a32_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_float pixel;
         pixel.rgb = src[0];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_l32a32_float pixel;
         pixel.rgb = src[0];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single l32a32_float pixel as RGBA floats.
 *
 * src points at the 8-byte pixel; i and j are unused.  Luminance is written
 * unchanged to r, g and b, alpha to a.  The same code serves either
 * UTIL_ARCH_BIG_ENDIAN setting.
 */
static inline void
util_format_l32a32_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_l32a32_float texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = texel.rgb; /* r */
   dst[1] = texel.rgb; /* g */
   dst[2] = texel.rgb; /* b */
   dst[3] = texel.a; /* a */
}

static inline void
util_format_l32a32_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.rgb); /* r */
         dst[1] = float_to_ubyte(pixel.rgb); /* g */
         dst[2] = float_to_ubyte(pixel.rgb); /* b */
         dst[3] = float_to_ubyte(pixel.a); /* a */
#else
         struct util_format_l32a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.rgb); /* r */
         dst[1] = float_to_ubyte(pixel.rgb); /* g */
         dst[2] = float_to_ubyte(pixel.rgb); /* b */
         dst[3] = float_to_ubyte(pixel.a); /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_l32a32_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_float pixel;
         pixel.rgb = ubyte_to_float(src[0]);
         pixel.a = ubyte_to_float(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_l32a32_float pixel;
         pixel.rgb = ubyte_to_float(src[0]);
         pixel.a = ubyte_to_float(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Layout of one I32_FLOAT pixel: a single float that the unpack and fetch
 * helpers replicate into all four RGBA channels.
 */
struct util_format_i32_float {
   float rgba;
};

static inline void
util_format_i32_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_i32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.rgba; /* r */
         dst[1] = pixel.rgba; /* g */
         dst[2] = pixel.rgba; /* b */
         dst[3] = pixel.rgba; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_i32_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_i32_float pixel;
         pixel.rgba = src[0];
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one I32_FLOAT pixel as RGBA float.
 *
 * The pixel's single float is copied into all four channels of dst.  The
 * coordinates i and j are not used.
 */
static inline void
util_format_i32_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_i32_float texel;

   memcpy(&texel, src, sizeof texel);

   for (unsigned c = 0; c < 4; ++c)
      dst[c] = texel.rgba; /* r, g, b, a */
}

static inline void
util_format_i32_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_i32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.rgba); /* r */
         dst[1] = float_to_ubyte(pixel.rgba); /* g */
         dst[2] = float_to_ubyte(pixel.rgba); /* b */
         dst[3] = float_to_ubyte(pixel.rgba); /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_i32_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_i32_float pixel;
         pixel.rgba = ubyte_to_float(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of L8_SRGB pixels into RGBA float.
 *
 * Each source byte is converted with
 * util_format_srgb_8unorm_to_linear_float() and replicated into r, g and b;
 * alpha is set to 1.  src_stride and dst_stride are row pitches in bytes.
 */
static inline void
util_format_l8_srgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint8_t lum = *in;

         out[0] = util_format_srgb_8unorm_to_linear_float(lum); /* r */
         out[1] = util_format_srgb_8unorm_to_linear_float(lum); /* g */
         out[2] = util_format_srgb_8unorm_to_linear_float(lum); /* b */
         out[3] = 1; /* a */

         in += 1;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of RGBA float pixels into L8_SRGB.
 *
 * Only the red channel of each source pixel is used; it is converted with
 * util_format_linear_float_to_srgb_8unorm() and stored as the single
 * destination byte.  src_stride and dst_stride are row pitches in bytes.
 */
static inline void
util_format_l8_srgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint8_t packed = util_format_linear_float_to_srgb_8unorm(in[0]);

         *out = packed;

         in += 4;
         out += 1;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one L8_SRGB pixel as RGBA float.
 *
 * The source byte is converted with
 * util_format_srgb_8unorm_to_linear_float() and replicated into r, g and b;
 * alpha is set to 1.  The coordinates i and j are not used.
 */
static inline void
util_format_l8_srgb_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint8_t lum = src[0];

   for (unsigned c = 0; c < 3; ++c)
      dst[c] = util_format_srgb_8unorm_to_linear_float(lum); /* r, g, b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a rectangle of L8_SRGB pixels into RGBA 8-bit unorm.
 *
 * Each source byte is converted with util_format_srgb_to_linear_8unorm()
 * and replicated into r, g and b; alpha is set to 255.  src_stride and
 * dst_stride are row pitches in bytes.
 */
static inline void
util_format_l8_srgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint8_t lum = *in;

         out[0] = util_format_srgb_to_linear_8unorm(lum); /* r */
         out[1] = util_format_srgb_to_linear_8unorm(lum); /* g */
         out[2] = util_format_srgb_to_linear_8unorm(lum); /* b */
         out[3] = 255; /* a */

         in += 1;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of RGBA 8-bit unorm pixels into L8_SRGB.
 *
 * Only the red byte of each source pixel is used; it is converted with
 * util_format_linear_to_srgb_8unorm() and stored as the single destination
 * byte.  src_stride and dst_stride are row pitches in bytes.
 */
static inline void
util_format_l8_srgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint8_t packed = util_format_linear_to_srgb_8unorm(in[0]);

         *out = packed;

         in += 4;
         out += 1;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of R8_SRGB pixels into RGBA float.
 *
 * Each source byte is converted with
 * util_format_srgb_8unorm_to_linear_float() into red; green and blue are
 * set to 0 and alpha to 1.  src_stride and dst_stride are row pitches in
 * bytes.
 */
static inline void
util_format_r8_srgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint8_t red = *in;

         out[0] = util_format_srgb_8unorm_to_linear_float(red); /* r */
         out[1] = 0; /* g */
         out[2] = 0; /* b */
         out[3] = 1; /* a */

         in += 1;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of RGBA float pixels into R8_SRGB.
 *
 * The red channel of each source pixel is converted with
 * util_format_linear_float_to_srgb_8unorm() and stored as the single
 * destination byte; the other channels are ignored.  src_stride and
 * dst_stride are row pitches in bytes.
 */
static inline void
util_format_r8_srgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint8_t packed = util_format_linear_float_to_srgb_8unorm(in[0]);

         *out = packed;

         in += 4;
         out += 1;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one R8_SRGB pixel as RGBA float.
 *
 * The source byte is converted with
 * util_format_srgb_8unorm_to_linear_float() into red; green and blue are
 * set to 0 and alpha to 1.  The coordinates i and j are not used.
 */
static inline void
util_format_r8_srgb_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint8_t red = src[0];

   dst[0] = util_format_srgb_8unorm_to_linear_float(red); /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a rectangle of R8_SRGB pixels into RGBA 8-bit unorm.
 *
 * Each source byte is converted with util_format_srgb_to_linear_8unorm()
 * into red; green and blue are set to 0 and alpha to 255.  src_stride and
 * dst_stride are row pitches in bytes.
 */
static inline void
util_format_r8_srgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint8_t red = *in;

         out[0] = util_format_srgb_to_linear_8unorm(red); /* r */
         out[1] = 0; /* g */
         out[2] = 0; /* b */
         out[3] = 255; /* a */

         in += 1;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of RGBA 8-bit unorm pixels into R8_SRGB.
 *
 * The red byte of each source pixel is converted with
 * util_format_linear_to_srgb_8unorm() and stored as the single destination
 * byte; the other channels are ignored.  src_stride and dst_stride are row
 * pitches in bytes.
 */
static inline void
util_format_r8_srgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint8_t packed = util_format_linear_to_srgb_8unorm(in[0]);

         *out = packed;

         in += 4;
         out += 1;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of L8A8_SRGB pixels into RGBA float.
 *
 * Each 2-byte source pixel is a luminance byte followed by an alpha byte.
 * Luminance is converted with util_format_srgb_8unorm_to_linear_float() and
 * replicated into r, g and b; alpha is converted with ubyte_to_float().
 * src_stride and dst_stride are row pitches in bytes.
 *
 * The bytes are read individually rather than through a uint16_t pointer
 * cast.  That yields the same channel values as the former big- and
 * little-endian code paths, and stays well defined when the source is not
 * 2-byte aligned.
 */
static inline void
util_format_l8a8_srgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         const uint8_t rgb = src[0]; /* first byte in memory */
         const uint8_t a = src[1];   /* second byte in memory */
         dst[0] = util_format_srgb_8unorm_to_linear_float(rgb); /* r */
         dst[1] = util_format_srgb_8unorm_to_linear_float(rgb); /* g */
         dst[2] = util_format_srgb_8unorm_to_linear_float(rgb); /* b */
         dst[3] = ubyte_to_float(a); /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of RGBA float pixels into L8A8_SRGB.
 *
 * For each source pixel, red is converted with
 * util_format_linear_float_to_srgb_8unorm() into the first destination byte
 * and alpha is converted with float_to_ubyte() into the second.  Green and
 * blue are ignored.  src_stride and dst_stride are row pitches in bytes.
 *
 * The bytes are written individually rather than through a uint16_t pointer
 * cast.  That produces the same memory contents as the former big- and
 * little-endian code paths, and stays well defined when the destination is
 * not 2-byte aligned.
 */
static inline void
util_format_l8a8_srgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         dst[0] = util_format_linear_float_to_srgb_8unorm(src[0]); /* l */
         dst[1] = float_to_ubyte(src[3]); /* a */
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one L8A8_SRGB pixel as RGBA float.
 *
 * The first source byte is converted with
 * util_format_srgb_8unorm_to_linear_float() and replicated into r, g and b;
 * the second is converted with ubyte_to_float() into alpha.  The
 * coordinates i and j are not used.
 *
 * The bytes are read individually rather than through a uint16_t pointer
 * cast.  That yields the same channel values as the former big- and
 * little-endian code paths, and stays well defined when src is not 2-byte
 * aligned.
 */
static inline void
util_format_l8a8_srgb_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint8_t rgb = src[0]; /* first byte in memory */
   const uint8_t a = src[1];   /* second byte in memory */
   dst[0] = util_format_srgb_8unorm_to_linear_float(rgb); /* r */
   dst[1] = util_format_srgb_8unorm_to_linear_float(rgb); /* g */
   dst[2] = util_format_srgb_8unorm_to_linear_float(rgb); /* b */
   dst[3] = ubyte_to_float(a); /* a */
}

/**
 * Unpack a rectangle of L8A8_SRGB pixels into RGBA 8-bit unorm.
 *
 * Each 2-byte source pixel is a luminance byte followed by an alpha byte.
 * Luminance is converted with util_format_srgb_to_linear_8unorm() and
 * replicated into r, g and b; alpha is copied unchanged.  src_stride and
 * dst_stride are row pitches in bytes.
 *
 * The bytes are read individually rather than through a uint16_t pointer
 * cast.  That yields the same channel values as the former big- and
 * little-endian code paths, and stays well defined when the source is not
 * 2-byte aligned.
 */
static inline void
util_format_l8a8_srgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         const uint8_t rgb = src[0]; /* first byte in memory */
         const uint8_t a = src[1];   /* second byte in memory */
         dst[0] = util_format_srgb_to_linear_8unorm(rgb); /* r */
         dst[1] = util_format_srgb_to_linear_8unorm(rgb); /* g */
         dst[2] = util_format_srgb_to_linear_8unorm(rgb); /* b */
         dst[3] = a; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of RGBA 8-bit unorm pixels into L8A8_SRGB.
 *
 * For each source pixel, red is converted with
 * util_format_linear_to_srgb_8unorm() into the first destination byte and
 * alpha is copied unchanged into the second.  Green and blue are ignored.
 * src_stride and dst_stride are row pitches in bytes.
 *
 * The bytes are written individually rather than through a uint16_t pointer
 * cast.  That produces the same memory contents as the former big- and
 * little-endian code paths, and stays well defined when the destination is
 * not 2-byte aligned.
 */
static inline void
util_format_l8a8_srgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         dst[0] = util_format_linear_to_srgb_8unorm(src[0]); /* l */
         dst[1] = src[3]; /* a */
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of R8G8_SRGB pixels into RGBA float.
 *
 * Each 2-byte source pixel is a red byte followed by a green byte; both are
 * converted with util_format_srgb_8unorm_to_linear_float().  Blue is set to
 * 0 and alpha to 1.  src_stride and dst_stride are row pitches in bytes.
 *
 * The bytes are read individually rather than through a uint16_t pointer
 * cast.  That yields the same channel values as the former big- and
 * little-endian code paths, and stays well defined when the source is not
 * 2-byte aligned.
 */
static inline void
util_format_r8g8_srgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         const uint8_t r = src[0]; /* first byte in memory */
         const uint8_t g = src[1]; /* second byte in memory */
         dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
         dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of RGBA float pixels into R8G8_SRGB.
 *
 * For each source pixel, red and green are converted with
 * util_format_linear_float_to_srgb_8unorm() into the first and second
 * destination bytes.  Blue and alpha are ignored.  src_stride and
 * dst_stride are row pitches in bytes.
 *
 * The bytes are written individually rather than through a uint16_t pointer
 * cast.  That produces the same memory contents as the former big- and
 * little-endian code paths, and stays well defined when the destination is
 * not 2-byte aligned.
 */
static inline void
util_format_r8g8_srgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         dst[0] = util_format_linear_float_to_srgb_8unorm(src[0]); /* r */
         dst[1] = util_format_linear_float_to_srgb_8unorm(src[1]); /* g */
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8G8_SRGB pixel as RGBA float.
 *
 * The first and second source bytes are converted with
 * util_format_srgb_8unorm_to_linear_float() into red and green.  Blue is
 * set to 0 and alpha to 1.  The coordinates i and j are not used.
 *
 * The bytes are read individually rather than through a uint16_t pointer
 * cast.  That yields the same channel values as the former big- and
 * little-endian code paths, and stays well defined when src is not 2-byte
 * aligned.
 */
static inline void
util_format_r8g8_srgb_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint8_t r = src[0]; /* first byte in memory */
   const uint8_t g = src[1]; /* second byte in memory */
   dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
   dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a rectangle of R8G8_SRGB pixels into RGBA 8-bit unorm.
 *
 * Each 2-byte source pixel is a red byte followed by a green byte; both are
 * converted with util_format_srgb_to_linear_8unorm().  Blue is set to 0 and
 * alpha to 255.  src_stride and dst_stride are row pitches in bytes.
 *
 * The bytes are read individually rather than through a uint16_t pointer
 * cast.  That yields the same channel values as the former big- and
 * little-endian code paths, and stays well defined when the source is not
 * 2-byte aligned.
 */
static inline void
util_format_r8g8_srgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         const uint8_t r = src[0]; /* first byte in memory */
         const uint8_t g = src[1]; /* second byte in memory */
         dst[0] = util_format_srgb_to_linear_8unorm(r); /* r */
         dst[1] = util_format_srgb_to_linear_8unorm(g); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of RGBA 8-bit unorm pixels into R8G8_SRGB.
 *
 * For each source pixel, red and green are converted with
 * util_format_linear_to_srgb_8unorm() into the first and second destination
 * bytes.  Blue and alpha are ignored.  src_stride and dst_stride are row
 * pitches in bytes.
 *
 * The bytes are written individually rather than through a uint16_t pointer
 * cast.  That produces the same memory contents as the former big- and
 * little-endian code paths, and stays well defined when the destination is
 * not 2-byte aligned.
 */
static inline void
util_format_r8g8_srgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         dst[0] = util_format_linear_to_srgb_8unorm(src[0]); /* r */
         dst[1] = util_format_linear_to_srgb_8unorm(src[1]); /* g */
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Layout of one R8G8B8_SRGB pixel: three consecutive bytes in r, g, b
 * order.  The member order is the same for big- and little-endian builds,
 * so no UTIL_ARCH_BIG_ENDIAN switch is needed.
 */
struct util_format_r8g8b8_srgb {
   uint8_t r;
   uint8_t g;
   uint8_t b;
};

static inline void
util_format_r8g8b8_srgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_srgb pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_format_srgb_8unorm_to_linear_float(pixel.r); /* r */
         dst[1] = util_format_srgb_8unorm_to_linear_float(pixel.g); /* g */
         dst[2] = util_format_srgb_8unorm_to_linear_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r8g8b8_srgb pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_format_srgb_8unorm_to_linear_float(pixel.r); /* r */
         dst[1] = util_format_srgb_8unorm_to_linear_float(pixel.g); /* g */
         dst[2] = util_format_srgb_8unorm_to_linear_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r8g8b8_srgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_srgb pixel;
         pixel.r = util_format_linear_float_to_srgb_8unorm(src[0]);
         pixel.g = util_format_linear_float_to_srgb_8unorm(src[1]);
         pixel.b = util_format_linear_float_to_srgb_8unorm(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_srgb pixel;
         pixel.r = util_format_linear_float_to_srgb_8unorm(src[0]);
         pixel.g = util_format_linear_float_to_srgb_8unorm(src[1]);
         pixel.b = util_format_linear_float_to_srgb_8unorm(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8G8B8_SRGB pixel as RGBA float.
 *
 * The r, g and b members are each converted with
 * util_format_srgb_8unorm_to_linear_float(); alpha is set to 1.  The
 * coordinates i and j are not used.
 */
static inline void
util_format_r8g8b8_srgb_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The same code serves big- and little-endian hosts. */
   struct util_format_r8g8b8_srgb texel;

   memcpy(&texel, src, sizeof texel);

   dst[0] = util_format_srgb_8unorm_to_linear_float(texel.r); /* r */
   dst[1] = util_format_srgb_8unorm_to_linear_float(texel.g); /* g */
   dst[2] = util_format_srgb_8unorm_to_linear_float(texel.b); /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r8g8b8_srgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_srgb pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_format_srgb_to_linear_8unorm(pixel.r); /* r */
         dst[1] = util_format_srgb_to_linear_8unorm(pixel.g); /* g */
         dst[2] = util_format_srgb_to_linear_8unorm(pixel.b); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r8g8b8_srgb pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_format_srgb_to_linear_8unorm(pixel.r); /* r */
         dst[1] = util_format_srgb_to_linear_8unorm(pixel.g); /* g */
         dst[2] = util_format_srgb_to_linear_8unorm(pixel.b); /* b */
         dst[3] = 255; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r8g8b8_srgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_srgb pixel;
         pixel.r = util_format_linear_to_srgb_8unorm(src[0]);
         pixel.g = util_format_linear_to_srgb_8unorm(src[1]);
         pixel.b = util_format_linear_to_srgb_8unorm(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_srgb pixel;
         pixel.r = util_format_linear_to_srgb_8unorm(src[0]);
         pixel.g = util_format_linear_to_srgb_8unorm(src[1]);
         pixel.b = util_format_linear_to_srgb_8unorm(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Layout of one B8G8R8_SRGB pixel: three consecutive bytes in b, g, r
 * order.  The member order is the same for big- and little-endian builds,
 * so no UTIL_ARCH_BIG_ENDIAN switch is needed.
 */
struct util_format_b8g8r8_srgb {
   uint8_t b;
   uint8_t g;
   uint8_t r;
};

static inline void
util_format_b8g8r8_srgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_srgb pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_format_srgb_8unorm_to_linear_float(pixel.r); /* r */
         dst[1] = util_format_srgb_8unorm_to_linear_float(pixel.g); /* g */
         dst[2] = util_format_srgb_8unorm_to_linear_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_b8g8r8_srgb pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_format_srgb_8unorm_to_linear_float(pixel.r); /* r */
         dst[1] = util_format_srgb_8unorm_to_linear_float(pixel.g); /* g */
         dst[2] = util_format_srgb_8unorm_to_linear_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_b8g8r8_srgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_srgb pixel;
         pixel.b = util_format_linear_float_to_srgb_8unorm(src[2]);
         pixel.g = util_format_linear_float_to_srgb_8unorm(src[1]);
         pixel.r = util_format_linear_float_to_srgb_8unorm(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_srgb pixel;
         pixel.b = util_format_linear_float_to_srgb_8unorm(src[2]);
         pixel.g = util_format_linear_float_to_srgb_8unorm(src[1]);
         pixel.r = util_format_linear_float_to_srgb_8unorm(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one B8G8R8_SRGB pixel as RGBA float.
 *
 * The r, g and b members are each converted with
 * util_format_srgb_8unorm_to_linear_float() and written in RGBA order;
 * alpha is set to 1.  The coordinates i and j are not used.
 */
static inline void
util_format_b8g8r8_srgb_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The same code serves big- and little-endian hosts. */
   struct util_format_b8g8r8_srgb texel;

   memcpy(&texel, src, sizeof texel);

   dst[0] = util_format_srgb_8unorm_to_linear_float(texel.r); /* r */
   dst[1] = util_format_srgb_8unorm_to_linear_float(texel.g); /* g */
   dst[2] = util_format_srgb_8unorm_to_linear_float(texel.b); /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_b8g8r8_srgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_srgb pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_format_srgb_to_linear_8unorm(pixel.r); /* r */
         dst[1] = util_format_srgb_to_linear_8unorm(pixel.g); /* g */
         dst[2] = util_format_srgb_to_linear_8unorm(pixel.b); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_b8g8r8_srgb pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_format_srgb_to_linear_8unorm(pixel.r); /* r */
         dst[1] = util_format_srgb_to_linear_8unorm(pixel.g); /* g */
         dst[2] = util_format_srgb_to_linear_8unorm(pixel.b); /* b */
         dst[3] = 255; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_b8g8r8_srgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_srgb pixel;
         pixel.b = util_format_linear_to_srgb_8unorm(src[2]);
         pixel.g = util_format_linear_to_srgb_8unorm(src[1]);
         pixel.r = util_format_linear_to_srgb_8unorm(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_srgb pixel;
         pixel.b = util_format_linear_to_srgb_8unorm(src[2]);
         pixel.g = util_format_linear_to_srgb_8unorm(src[1]);
         pixel.r = util_format_linear_to_srgb_8unorm(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a width x height rectangle of R8G8B8A8_SRGB pixels into float RGBA.
 *
 * R, G and B are converted with util_format_srgb_8unorm_to_linear_float();
 * alpha is converted with ubyte_to_float().  Both strides are in bytes.
 * In memory the pixel bytes are R, G, B, A on either endianness.
 */
static inline void
util_format_r8g8b8a8_srgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t r;
         uint32_t g;
         uint32_t b;
         uint32_t a;
         /* memcpy, not a uint32_t* cast: src is only known to be byte-aligned. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = value >> 24;
         g = (value >> 16) & 0xff;
         b = (value >> 8) & 0xff;
         a = (value) & 0xff;
#else
         r = (value) & 0xff;
         g = (value >> 8) & 0xff;
         b = (value >> 16) & 0xff;
         a = value >> 24;
#endif
         dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
         dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
         dst[2] = util_format_srgb_8unorm_to_linear_float(b); /* b */
         dst[3] = ubyte_to_float(a); /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of float RGBA pixels into R8G8B8A8_SRGB.
 *
 * R, G and B are converted with util_format_linear_float_to_srgb_8unorm();
 * alpha is converted with float_to_ubyte().  Both strides are in bytes.
 * In memory the pixel bytes are R, G, B, A on either endianness.
 */
static inline void
util_format_r8g8b8a8_srgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t r = (uint32_t)util_format_linear_float_to_srgb_8unorm(src[0]) & 0xff;
         const uint32_t g = (uint32_t)util_format_linear_float_to_srgb_8unorm(src[1]) & 0xff;
         const uint32_t b = (uint32_t)util_format_linear_float_to_srgb_8unorm(src[2]) & 0xff;
         const uint32_t a = (uint32_t)float_to_ubyte(src[3]) & 0xff;
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (r << 24) | (g << 16) | (b << 8) | a;
#else
         value = r | (g << 8) | (b << 16) | (a << 24);
#endif
         /* memcpy, not a uint32_t* cast: dst is only known to be byte-aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8G8B8A8_SRGB pixel at src and write it to dst[0..3] as float
 * RGBA.  The i/j coordinates are unused.
 *
 * R, G and B are converted with util_format_srgb_8unorm_to_linear_float();
 * alpha is converted with ubyte_to_float().
 */
static inline void
util_format_r8g8b8a8_srgb_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   uint32_t r;
   uint32_t g;
   uint32_t b;
   uint32_t a;
   /* memcpy, not a uint32_t* cast: src is only known to be byte-aligned. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   r = value >> 24;
   g = (value >> 16) & 0xff;
   b = (value >> 8) & 0xff;
   a = (value) & 0xff;
#else
   r = (value) & 0xff;
   g = (value >> 8) & 0xff;
   b = (value >> 16) & 0xff;
   a = value >> 24;
#endif
   dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
   dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
   dst[2] = util_format_srgb_8unorm_to_linear_float(b); /* b */
   dst[3] = ubyte_to_float(a); /* a */
}

/**
 * Unpack a width x height rectangle of R8G8B8A8_SRGB pixels into 8-bit RGBA.
 *
 * R, G and B are converted with util_format_srgb_to_linear_8unorm(); alpha
 * is copied unchanged.  Both strides are in bytes.
 */
static inline void
util_format_r8g8b8a8_srgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t r;
         uint32_t g;
         uint32_t b;
         uint32_t a;
         /* memcpy, not a uint32_t* cast: src is only known to be byte-aligned. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = value >> 24;
         g = (value >> 16) & 0xff;
         b = (value >> 8) & 0xff;
         a = (value) & 0xff;
#else
         r = (value) & 0xff;
         g = (value >> 8) & 0xff;
         b = (value >> 16) & 0xff;
         a = value >> 24;
#endif
         dst[0] = util_format_srgb_to_linear_8unorm(r); /* r */
         dst[1] = util_format_srgb_to_linear_8unorm(g); /* g */
         dst[2] = util_format_srgb_to_linear_8unorm(b); /* b */
         dst[3] = a; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into R8G8B8A8_SRGB.
 *
 * R, G and B are converted with util_format_linear_to_srgb_8unorm(); alpha
 * is copied unchanged.  Both strides are in bytes.
 */
static inline void
util_format_r8g8b8a8_srgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t r = (uint32_t)util_format_linear_to_srgb_8unorm(src[0]) & 0xff;
         const uint32_t g = (uint32_t)util_format_linear_to_srgb_8unorm(src[1]) & 0xff;
         const uint32_t b = (uint32_t)util_format_linear_to_srgb_8unorm(src[2]) & 0xff;
         const uint32_t a = src[3];
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (r << 24) | (g << 16) | (b << 8) | a;
#else
         value = r | (g << 8) | (b << 16) | (a << 24);
#endif
         /* memcpy, not a uint32_t* cast: dst is only known to be byte-aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a width x height rectangle of A8B8G8R8_SRGB pixels into float RGBA.
 *
 * R, G and B are converted with util_format_srgb_8unorm_to_linear_float();
 * alpha is converted with ubyte_to_float().  Both strides are in bytes.
 * In memory the pixel bytes are A, B, G, R on either endianness.
 */
static inline void
util_format_a8b8g8r8_srgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t a;
         uint32_t b;
         uint32_t g;
         uint32_t r;
         /* memcpy, not a uint32_t* cast: src is only known to be byte-aligned. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         a = value >> 24;
         b = (value >> 16) & 0xff;
         g = (value >> 8) & 0xff;
         r = (value) & 0xff;
#else
         a = (value) & 0xff;
         b = (value >> 8) & 0xff;
         g = (value >> 16) & 0xff;
         r = value >> 24;
#endif
         dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
         dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
         dst[2] = util_format_srgb_8unorm_to_linear_float(b); /* b */
         dst[3] = ubyte_to_float(a); /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of float RGBA pixels into A8B8G8R8_SRGB.
 *
 * R, G and B are converted with util_format_linear_float_to_srgb_8unorm();
 * alpha is converted with float_to_ubyte().  Both strides are in bytes.
 * In memory the pixel bytes are A, B, G, R on either endianness.
 */
static inline void
util_format_a8b8g8r8_srgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t a = (uint32_t)float_to_ubyte(src[3]) & 0xff;
         const uint32_t b = (uint32_t)util_format_linear_float_to_srgb_8unorm(src[2]) & 0xff;
         const uint32_t g = (uint32_t)util_format_linear_float_to_srgb_8unorm(src[1]) & 0xff;
         const uint32_t r = (uint32_t)util_format_linear_float_to_srgb_8unorm(src[0]) & 0xff;
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (a << 24) | (b << 16) | (g << 8) | r;
#else
         value = a | (b << 8) | (g << 16) | (r << 24);
#endif
         /* memcpy, not a uint32_t* cast: dst is only known to be byte-aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one A8B8G8R8_SRGB pixel at src and write it to dst[0..3] as float
 * RGBA.  The i/j coordinates are unused.
 *
 * R, G and B are converted with util_format_srgb_8unorm_to_linear_float();
 * alpha is converted with ubyte_to_float().
 */
static inline void
util_format_a8b8g8r8_srgb_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   uint32_t a;
   uint32_t b;
   uint32_t g;
   uint32_t r;
   /* memcpy, not a uint32_t* cast: src is only known to be byte-aligned. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   a = value >> 24;
   b = (value >> 16) & 0xff;
   g = (value >> 8) & 0xff;
   r = (value) & 0xff;
#else
   a = (value) & 0xff;
   b = (value >> 8) & 0xff;
   g = (value >> 16) & 0xff;
   r = value >> 24;
#endif
   dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
   dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
   dst[2] = util_format_srgb_8unorm_to_linear_float(b); /* b */
   dst[3] = ubyte_to_float(a); /* a */
}

/**
 * Unpack a width x height rectangle of A8B8G8R8_SRGB pixels into 8-bit RGBA.
 *
 * R, G and B are converted with util_format_srgb_to_linear_8unorm(); alpha
 * is copied unchanged.  Both strides are in bytes.
 */
static inline void
util_format_a8b8g8r8_srgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t a;
         uint32_t b;
         uint32_t g;
         uint32_t r;
         /* memcpy, not a uint32_t* cast: src is only known to be byte-aligned. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         a = value >> 24;
         b = (value >> 16) & 0xff;
         g = (value >> 8) & 0xff;
         r = (value) & 0xff;
#else
         a = (value) & 0xff;
         b = (value >> 8) & 0xff;
         g = (value >> 16) & 0xff;
         r = value >> 24;
#endif
         dst[0] = util_format_srgb_to_linear_8unorm(r); /* r */
         dst[1] = util_format_srgb_to_linear_8unorm(g); /* g */
         dst[2] = util_format_srgb_to_linear_8unorm(b); /* b */
         dst[3] = a; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into A8B8G8R8_SRGB.
 *
 * R, G and B are converted with util_format_linear_to_srgb_8unorm(); alpha
 * is copied unchanged.  Both strides are in bytes.
 */
static inline void
util_format_a8b8g8r8_srgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t a = src[3];
         const uint32_t b = (uint32_t)util_format_linear_to_srgb_8unorm(src[2]) & 0xff;
         const uint32_t g = (uint32_t)util_format_linear_to_srgb_8unorm(src[1]) & 0xff;
         const uint32_t r = (uint32_t)util_format_linear_to_srgb_8unorm(src[0]) & 0xff;
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (a << 24) | (b << 16) | (g << 8) | r;
#else
         value = a | (b << 8) | (g << 16) | (r << 24);
#endif
         /* memcpy, not a uint32_t* cast: dst is only known to be byte-aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a width x height rectangle of X8B8G8R8_SRGB pixels into float RGBA.
 *
 * R, G and B are converted with util_format_srgb_8unorm_to_linear_float();
 * the X byte is ignored and alpha is written as 1.  Both strides are in
 * bytes.  In memory the pixel bytes are X, B, G, R on either endianness.
 */
static inline void
util_format_x8b8g8r8_srgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t b;
         uint32_t g;
         uint32_t r;
         /* memcpy, not a uint32_t* cast: src is only known to be byte-aligned. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         b = (value >> 16) & 0xff;
         g = (value >> 8) & 0xff;
         r = (value) & 0xff;
#else
         b = (value >> 8) & 0xff;
         g = (value >> 16) & 0xff;
         r = value >> 24;
#endif
         dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
         dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
         dst[2] = util_format_srgb_8unorm_to_linear_float(b); /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of float RGBA pixels into X8B8G8R8_SRGB.
 *
 * R, G and B are converted with util_format_linear_float_to_srgb_8unorm();
 * source alpha is ignored and the X byte is written as 0.  Both strides are
 * in bytes.
 */
static inline void
util_format_x8b8g8r8_srgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t b = (uint32_t)util_format_linear_float_to_srgb_8unorm(src[2]) & 0xff;
         const uint32_t g = (uint32_t)util_format_linear_float_to_srgb_8unorm(src[1]) & 0xff;
         const uint32_t r = (uint32_t)util_format_linear_float_to_srgb_8unorm(src[0]) & 0xff;
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (b << 16) | (g << 8) | r;
#else
         value = (b << 8) | (g << 16) | (r << 24);
#endif
         /* memcpy, not a uint32_t* cast: dst is only known to be byte-aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one X8B8G8R8_SRGB pixel at src and write it to dst[0..3] as float
 * RGBA.  The i/j coordinates are unused.
 *
 * R, G and B are converted with util_format_srgb_8unorm_to_linear_float();
 * the X byte is ignored and alpha is written as 1.
 */
static inline void
util_format_x8b8g8r8_srgb_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   uint32_t b;
   uint32_t g;
   uint32_t r;
   /* memcpy, not a uint32_t* cast: src is only known to be byte-aligned. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   b = (value >> 16) & 0xff;
   g = (value >> 8) & 0xff;
   r = (value) & 0xff;
#else
   b = (value >> 8) & 0xff;
   g = (value >> 16) & 0xff;
   r = value >> 24;
#endif
   dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
   dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
   dst[2] = util_format_srgb_8unorm_to_linear_float(b); /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a width x height rectangle of X8B8G8R8_SRGB pixels into 8-bit RGBA.
 *
 * R, G and B are converted with util_format_srgb_to_linear_8unorm(); the X
 * byte is ignored and alpha is written as 255.  Both strides are in bytes.
 */
static inline void
util_format_x8b8g8r8_srgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t b;
         uint32_t g;
         uint32_t r;
         /* memcpy, not a uint32_t* cast: src is only known to be byte-aligned. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         b = (value >> 16) & 0xff;
         g = (value >> 8) & 0xff;
         r = (value) & 0xff;
#else
         b = (value >> 8) & 0xff;
         g = (value >> 16) & 0xff;
         r = value >> 24;
#endif
         dst[0] = util_format_srgb_to_linear_8unorm(r); /* r */
         dst[1] = util_format_srgb_to_linear_8unorm(g); /* g */
         dst[2] = util_format_srgb_to_linear_8unorm(b); /* b */
         dst[3] = 255; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into X8B8G8R8_SRGB.
 *
 * R, G and B are converted with util_format_linear_to_srgb_8unorm(); source
 * alpha is ignored and the X byte is written as 0.  Both strides are in
 * bytes.
 */
static inline void
util_format_x8b8g8r8_srgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t b = (uint32_t)util_format_linear_to_srgb_8unorm(src[2]) & 0xff;
         const uint32_t g = (uint32_t)util_format_linear_to_srgb_8unorm(src[1]) & 0xff;
         const uint32_t r = (uint32_t)util_format_linear_to_srgb_8unorm(src[0]) & 0xff;
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (b << 16) | (g << 8) | r;
#else
         value = (b << 8) | (g << 16) | (r << 24);
#endif
         /* memcpy, not a uint32_t* cast: dst is only known to be byte-aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a width x height rectangle of B8G8R8A8_SRGB pixels into float RGBA.
 *
 * R, G and B are converted with util_format_srgb_8unorm_to_linear_float();
 * alpha is converted with ubyte_to_float().  Both strides are in bytes.
 * In memory the pixel bytes are B, G, R, A on either endianness.
 */
static inline void
util_format_b8g8r8a8_srgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t b;
         uint32_t g;
         uint32_t r;
         uint32_t a;
         /* memcpy, not a uint32_t* cast: src is only known to be byte-aligned. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         b = value >> 24;
         g = (value >> 16) & 0xff;
         r = (value >> 8) & 0xff;
         a = (value) & 0xff;
#else
         b = (value) & 0xff;
         g = (value >> 8) & 0xff;
         r = (value >> 16) & 0xff;
         a = value >> 24;
#endif
         dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
         dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
         dst[2] = util_format_srgb_8unorm_to_linear_float(b); /* b */
         dst[3] = ubyte_to_float(a); /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of float RGBA pixels into B8G8R8A8_SRGB.
 *
 * R, G and B are converted with util_format_linear_float_to_srgb_8unorm();
 * alpha is converted with float_to_ubyte().  Both strides are in bytes.
 * In memory the pixel bytes are B, G, R, A on either endianness.
 */
static inline void
util_format_b8g8r8a8_srgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t b = (uint32_t)util_format_linear_float_to_srgb_8unorm(src[2]) & 0xff;
         const uint32_t g = (uint32_t)util_format_linear_float_to_srgb_8unorm(src[1]) & 0xff;
         const uint32_t r = (uint32_t)util_format_linear_float_to_srgb_8unorm(src[0]) & 0xff;
         const uint32_t a = (uint32_t)float_to_ubyte(src[3]) & 0xff;
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (b << 24) | (g << 16) | (r << 8) | a;
#else
         value = b | (g << 8) | (r << 16) | (a << 24);
#endif
         /* memcpy, not a uint32_t* cast: dst is only known to be byte-aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one B8G8R8A8_SRGB pixel at src and write it to dst[0..3] as float
 * RGBA.  The i/j coordinates are unused.
 *
 * R, G and B are converted with util_format_srgb_8unorm_to_linear_float();
 * alpha is converted with ubyte_to_float().
 */
static inline void
util_format_b8g8r8a8_srgb_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   uint32_t b;
   uint32_t g;
   uint32_t r;
   uint32_t a;
   /* memcpy, not a uint32_t* cast: src is only known to be byte-aligned. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   b = value >> 24;
   g = (value >> 16) & 0xff;
   r = (value >> 8) & 0xff;
   a = (value) & 0xff;
#else
   b = (value) & 0xff;
   g = (value >> 8) & 0xff;
   r = (value >> 16) & 0xff;
   a = value >> 24;
#endif
   dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
   dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
   dst[2] = util_format_srgb_8unorm_to_linear_float(b); /* b */
   dst[3] = ubyte_to_float(a); /* a */
}

/**
 * Unpack a width x height rectangle of B8G8R8A8_SRGB pixels into 8-bit RGBA.
 *
 * R, G and B are converted with util_format_srgb_to_linear_8unorm(); alpha
 * is copied unchanged.  Both strides are in bytes.
 */
static inline void
util_format_b8g8r8a8_srgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t b;
         uint32_t g;
         uint32_t r;
         uint32_t a;
         /* memcpy, not a uint32_t* cast: src is only known to be byte-aligned. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         b = value >> 24;
         g = (value >> 16) & 0xff;
         r = (value >> 8) & 0xff;
         a = (value) & 0xff;
#else
         b = (value) & 0xff;
         g = (value >> 8) & 0xff;
         r = (value >> 16) & 0xff;
         a = value >> 24;
#endif
         dst[0] = util_format_srgb_to_linear_8unorm(r); /* r */
         dst[1] = util_format_srgb_to_linear_8unorm(g); /* g */
         dst[2] = util_format_srgb_to_linear_8unorm(b); /* b */
         dst[3] = a; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into B8G8R8A8_SRGB.
 *
 * R, G and B are converted with util_format_linear_to_srgb_8unorm(); alpha
 * is copied unchanged.  Both strides are in bytes.
 */
static inline void
util_format_b8g8r8a8_srgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t b = (uint32_t)util_format_linear_to_srgb_8unorm(src[2]) & 0xff;
         const uint32_t g = (uint32_t)util_format_linear_to_srgb_8unorm(src[1]) & 0xff;
         const uint32_t r = (uint32_t)util_format_linear_to_srgb_8unorm(src[0]) & 0xff;
         const uint32_t a = src[3];
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (b << 24) | (g << 16) | (r << 8) | a;
#else
         value = b | (g << 8) | (r << 16) | (a << 24);
#endif
         /* memcpy, not a uint32_t* cast: dst is only known to be byte-aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a width x height rectangle of B8G8R8X8_SRGB pixels into float RGBA.
 *
 * R, G and B are converted with util_format_srgb_8unorm_to_linear_float();
 * the X byte is ignored and alpha is written as 1.  Both strides are in
 * bytes.  In memory the pixel bytes are B, G, R, X on either endianness.
 */
static inline void
util_format_b8g8r8x8_srgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t b;
         uint32_t g;
         uint32_t r;
         /* memcpy, not a uint32_t* cast: src is only known to be byte-aligned. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         b = value >> 24;
         g = (value >> 16) & 0xff;
         r = (value >> 8) & 0xff;
#else
         b = (value) & 0xff;
         g = (value >> 8) & 0xff;
         r = (value >> 16) & 0xff;
#endif
         dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
         dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
         dst[2] = util_format_srgb_8unorm_to_linear_float(b); /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of float RGBA pixels into B8G8R8X8_SRGB.
 *
 * R, G and B are converted with util_format_linear_float_to_srgb_8unorm();
 * source alpha is ignored and the X byte is written as 0.  Both strides are
 * in bytes.
 */
static inline void
util_format_b8g8r8x8_srgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t b = (uint32_t)util_format_linear_float_to_srgb_8unorm(src[2]) & 0xff;
         const uint32_t g = (uint32_t)util_format_linear_float_to_srgb_8unorm(src[1]) & 0xff;
         const uint32_t r = (uint32_t)util_format_linear_float_to_srgb_8unorm(src[0]) & 0xff;
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (b << 24) | (g << 16) | (r << 8);
#else
         value = b | (g << 8) | (r << 16);
#endif
         /* memcpy, not a uint32_t* cast: dst is only known to be byte-aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one B8G8R8X8_SRGB pixel at src and write it to dst[0..3] as float
 * RGBA.  The i/j coordinates are unused.
 *
 * R, G and B are converted with util_format_srgb_8unorm_to_linear_float();
 * the X byte is ignored and alpha is written as 1.
 */
static inline void
util_format_b8g8r8x8_srgb_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   uint32_t b;
   uint32_t g;
   uint32_t r;
   /* memcpy, not a uint32_t* cast: src is only known to be byte-aligned. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   b = value >> 24;
   g = (value >> 16) & 0xff;
   r = (value >> 8) & 0xff;
#else
   b = (value) & 0xff;
   g = (value >> 8) & 0xff;
   r = (value >> 16) & 0xff;
#endif
   dst[0] = util_format_srgb_8unorm_to_linear_float(r); /* r */
   dst[1] = util_format_srgb_8unorm_to_linear_float(g); /* g */
   dst[2] = util_format_srgb_8unorm_to_linear_float(b); /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a width x height rectangle of B8G8R8X8_SRGB pixels into 8-bit RGBA.
 *
 * R, G and B are converted with util_format_srgb_to_linear_8unorm(); the X
 * byte is ignored and alpha is written as 255.  Both strides are in bytes.
 */
static inline void
util_format_b8g8r8x8_srgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t b;
         uint32_t g;
         uint32_t r;
         /* memcpy, not a uint32_t* cast: src is only known to be byte-aligned. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         b = value >> 24;
         g = (value >> 16) & 0xff;
         r = (value >> 8) & 0xff;
#else
         b = (value) & 0xff;
         g = (value >> 8) & 0xff;
         r = (value >> 16) & 0xff;
#endif
         dst[0] = util_format_srgb_to_linear_8unorm(r); /* r */
         dst[1] = util_format_srgb_to_linear_8unorm(g); /* g */
         dst[2] = util_format_srgb_to_linear_8unorm(b); /* b */
         dst[3] = 255; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into B8G8R8X8_SRGB.
 *
 * R, G and B are converted with util_format_linear_to_srgb_8unorm(); source
 * alpha is ignored and the X byte is written as 0.  Both strides are in
 * bytes.
 */
static inline void
util_format_b8g8r8x8_srgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t b = (uint32_t)util_format_linear_to_srgb_8unorm(src[2]) & 0xff;
         const uint32_t g = (uint32_t)util_format_linear_to_srgb_8unorm(src[1]) & 0xff;
         const uint32_t r = (uint32_t)util_format_linear_to_srgb_8unorm(src[0]) & 0xff;
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (b << 24) | (g << 16) | (r << 8);
#else
         value = b | (g << 8) | (r << 16);
#endif
         /* memcpy, not a uint32_t* cast: dst is only known to be byte-aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack rows of 32-bit A8R8G8B8 words into four floats per pixel.
 *
 * R, G and B go through util_format_srgb_8unorm_to_linear_float(); alpha
 * goes through ubyte_to_float().  src_stride is applied in bytes;
 * dst_stride is given in bytes and converted to a float count.
 */
static inline void
util_format_a8r8g8b8_srgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the native-endian 32-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned a_shift = 0, r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t a = (pixel >> a_shift) & 0xff;
         const uint32_t r = (pixel >> r_shift) & 0xff;
         const uint32_t g = (pixel >> g_shift) & 0xff;
         const uint32_t b = (pixel >> b_shift) & 0xff;

         out[0] = util_format_srgb_8unorm_to_linear_float(r);
         out[1] = util_format_srgb_8unorm_to_linear_float(g);
         out[2] = util_format_srgb_8unorm_to_linear_float(b);
         out[3] = ubyte_to_float(a);
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack rows of four-float RGBA pixels into 32-bit A8R8G8B8 words.
 *
 * R, G and B go through util_format_linear_float_to_srgb_8unorm(); alpha
 * goes through float_to_ubyte().  dst_stride is applied in bytes;
 * src_stride is given in bytes and converted to a float count.
 */
static inline void
util_format_a8r8g8b8_srgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the native-endian 32-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned a_shift = 0, r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t a = float_to_ubyte(in[3]) & 0xff;
         const uint32_t r = util_format_linear_float_to_srgb_8unorm(in[0]) & 0xff;
         const uint32_t g = util_format_linear_float_to_srgb_8unorm(in[1]) & 0xff;
         const uint32_t b = util_format_linear_float_to_srgb_8unorm(in[2]) & 0xff;

         *(uint32_t *)out = (a << a_shift) | (r << r_shift) |
                            (g << g_shift) | (b << b_shift);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Decode the single 32-bit A8R8G8B8 word at src into dst[0..3] (r, g, b, a).
 *
 * R, G and B go through util_format_srgb_8unorm_to_linear_float(); alpha
 * goes through ubyte_to_float().  The i and j arguments are not used.
 */
static inline void
util_format_a8r8g8b8_srgb_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Bit position of each channel inside the native-endian 32-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned a_shift = 0, r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   const uint32_t pixel = *(const uint32_t *)src;
   const uint32_t a = (pixel >> a_shift) & 0xff;
   const uint32_t r = (pixel >> r_shift) & 0xff;
   const uint32_t g = (pixel >> g_shift) & 0xff;
   const uint32_t b = (pixel >> b_shift) & 0xff;

   dst[0] = util_format_srgb_8unorm_to_linear_float(r);
   dst[1] = util_format_srgb_8unorm_to_linear_float(g);
   dst[2] = util_format_srgb_8unorm_to_linear_float(b);
   dst[3] = ubyte_to_float(a);
}

/**
 * Unpack rows of 32-bit A8R8G8B8 words into 4-byte RGBA pixels.
 *
 * R, G and B go through util_format_srgb_to_linear_8unorm(); alpha is
 * copied unchanged.  Both strides are applied as byte offsets between rows.
 */
static inline void
util_format_a8r8g8b8_srgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the native-endian 32-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned a_shift = 0, r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t a = (pixel >> a_shift) & 0xff;
         const uint32_t r = (pixel >> r_shift) & 0xff;
         const uint32_t g = (pixel >> g_shift) & 0xff;
         const uint32_t b = (pixel >> b_shift) & 0xff;

         out[0] = util_format_srgb_to_linear_8unorm(r);
         out[1] = util_format_srgb_to_linear_8unorm(g);
         out[2] = util_format_srgb_to_linear_8unorm(b);
         out[3] = a;
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack rows of 4-byte RGBA pixels into 32-bit A8R8G8B8 words.
 *
 * R, G and B go through util_format_linear_to_srgb_8unorm(); alpha is
 * stored unchanged.  Both strides are applied as byte offsets between rows.
 */
static inline void
util_format_a8r8g8b8_srgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the native-endian 32-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned a_shift = 0, r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t a = in[3];
         const uint32_t r = util_format_linear_to_srgb_8unorm(in[0]) & 0xff;
         const uint32_t g = util_format_linear_to_srgb_8unorm(in[1]) & 0xff;
         const uint32_t b = util_format_linear_to_srgb_8unorm(in[2]) & 0xff;

         *(uint32_t *)out = (a << a_shift) | (r << r_shift) |
                            (g << g_shift) | (b << b_shift);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack rows of 32-bit X8R8G8B8 words into four floats per pixel.
 *
 * R, G and B go through util_format_srgb_8unorm_to_linear_float(); the X
 * byte is ignored and alpha is written as 1.  src_stride is applied in
 * bytes; dst_stride is given in bytes and converted to a float count.
 */
static inline void
util_format_x8r8g8b8_srgb_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the native-endian 32-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t r = (pixel >> r_shift) & 0xff;
         const uint32_t g = (pixel >> g_shift) & 0xff;
         const uint32_t b = (pixel >> b_shift) & 0xff;

         out[0] = util_format_srgb_8unorm_to_linear_float(r);
         out[1] = util_format_srgb_8unorm_to_linear_float(g);
         out[2] = util_format_srgb_8unorm_to_linear_float(b);
         out[3] = 1; /* no alpha channel stored */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack rows of four-float RGBA pixels into 32-bit X8R8G8B8 words.
 *
 * R, G and B go through util_format_linear_float_to_srgb_8unorm(); the
 * source alpha is not read and the X byte is written as zero.  dst_stride
 * is applied in bytes; src_stride is given in bytes and converted to a
 * float count.
 */
static inline void
util_format_x8r8g8b8_srgb_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the native-endian 32-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t r = util_format_linear_float_to_srgb_8unorm(in[0]) & 0xff;
         const uint32_t g = util_format_linear_float_to_srgb_8unorm(in[1]) & 0xff;
         const uint32_t b = util_format_linear_float_to_srgb_8unorm(in[2]) & 0xff;

         *(uint32_t *)out = (r << r_shift) | (g << g_shift) | (b << b_shift);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Decode the single 32-bit X8R8G8B8 word at src into dst[0..3] (r, g, b, a).
 *
 * R, G and B go through util_format_srgb_8unorm_to_linear_float(); alpha
 * is written as 1.  The i and j arguments are not used.
 */
static inline void
util_format_x8r8g8b8_srgb_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Bit position of each channel inside the native-endian 32-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   const uint32_t pixel = *(const uint32_t *)src;
   const uint32_t r = (pixel >> r_shift) & 0xff;
   const uint32_t g = (pixel >> g_shift) & 0xff;
   const uint32_t b = (pixel >> b_shift) & 0xff;

   dst[0] = util_format_srgb_8unorm_to_linear_float(r);
   dst[1] = util_format_srgb_8unorm_to_linear_float(g);
   dst[2] = util_format_srgb_8unorm_to_linear_float(b);
   dst[3] = 1; /* no alpha channel stored */
}

/**
 * Unpack rows of 32-bit X8R8G8B8 words into 4-byte RGBA pixels.
 *
 * R, G and B go through util_format_srgb_to_linear_8unorm(); the X byte is
 * ignored and alpha is written as 255.  Both strides are applied as byte
 * offsets between rows.
 */
static inline void
util_format_x8r8g8b8_srgb_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the native-endian 32-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t pixel = *(const uint32_t *)in;
         const uint32_t r = (pixel >> r_shift) & 0xff;
         const uint32_t g = (pixel >> g_shift) & 0xff;
         const uint32_t b = (pixel >> b_shift) & 0xff;

         out[0] = util_format_srgb_to_linear_8unorm(r);
         out[1] = util_format_srgb_to_linear_8unorm(g);
         out[2] = util_format_srgb_to_linear_8unorm(b);
         out[3] = 255; /* no alpha channel stored */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack rows of 4-byte RGBA pixels into 32-bit X8R8G8B8 words.
 *
 * R, G and B go through util_format_linear_to_srgb_8unorm(); the source
 * alpha byte is not read and the X byte is written as zero.  Both strides
 * are applied as byte offsets between rows.
 */
static inline void
util_format_x8r8g8b8_srgb_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the native-endian 32-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_shift = 16, g_shift = 8, b_shift = 0;
#else
   const unsigned r_shift = 8, g_shift = 16, b_shift = 24;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t r = util_format_linear_to_srgb_8unorm(in[0]) & 0xff;
         const uint32_t g = util_format_linear_to_srgb_8unorm(in[1]) & 0xff;
         const uint32_t b = util_format_linear_to_srgb_8unorm(in[2]) & 0xff;

         *(uint32_t *)out = (r << r_shift) | (g << g_shift) | (b << b_shift);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack rows of 32-bit R8S G8S B8U X8U words into four floats per pixel.
 *
 * R and G are sign-extended 8-bit fields scaled by 1/0x7f; B is an
 * unsigned byte converted with ubyte_to_float(); alpha is written as 1.
 * src_stride is applied in bytes; dst_stride is given in bytes and
 * converted to a float count.
 */
static inline void
util_format_r8sg8sb8ux8u_norm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /*
    * r_up / g_up: left shift that moves the signed field to the top byte so
    * that the following arithmetic >> 24 sign-extends it.
    * b_shift: right shift that moves the unsigned B field to the low byte.
    */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_up = 0, g_up = 8, b_shift = 8;
#else
   const unsigned r_up = 24, g_up = 16, b_shift = 16;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t pixel = *(const uint32_t *)in;
         const int32_t r = ((int32_t)(pixel << r_up)) >> 24;
         const int32_t g = ((int32_t)(pixel << g_up)) >> 24;
         const uint32_t b = (pixel >> b_shift) & 0xff;

         out[0] = (float)(r * (1.0f/0x7f));
         out[1] = (float)(g * (1.0f/0x7f));
         out[2] = ubyte_to_float(b);
         out[3] = 1; /* no alpha channel stored */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack rows of four-float RGBA pixels into 32-bit R8S G8S B8U X8U words.
 *
 * R and G are clamped to [-1, 1], scaled by 0x7f, rounded with
 * util_iround() and stored as 8-bit fields; B goes through
 * float_to_ubyte().  The source alpha is not read and the X byte is
 * written as zero.  dst_stride is applied in bytes; src_stride is given in
 * bytes and converted to a float count.
 */
static inline void
util_format_r8sg8sb8ux8u_norm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the native-endian 32-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_shift = 24, g_shift = 16, b_shift = 8;
#else
   const unsigned r_shift = 0, g_shift = 8, b_shift = 16;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         /* The 0xff mask keeps only the low 8 bits of the rounded value. */
         const uint32_t r = ((uint32_t)util_iround(CLAMP(in[0], -1.0f, 1.0f) * 0x7f)) & 0xff;
         const uint32_t g = ((uint32_t)util_iround(CLAMP(in[1], -1.0f, 1.0f) * 0x7f)) & 0xff;
         const uint32_t b = (uint32_t)((float_to_ubyte(in[2])) & 0xff);

         *(uint32_t *)out = (r << r_shift) | (g << g_shift) | (b << b_shift);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Decode the single 32-bit R8S G8S B8U X8U word at src into dst[0..3].
 *
 * R and G are sign-extended 8-bit fields scaled by 1/0x7f; B is converted
 * with ubyte_to_float(); alpha is written as 1.  The i and j arguments are
 * not used.
 */
static inline void
util_format_r8sg8sb8ux8u_norm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /*
    * r_up / g_up: left shift that moves the signed field to the top byte so
    * that the following arithmetic >> 24 sign-extends it.
    * b_shift: right shift that moves the unsigned B field to the low byte.
    */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_up = 0, g_up = 8, b_shift = 8;
#else
   const unsigned r_up = 24, g_up = 16, b_shift = 16;
#endif
   const uint32_t pixel = *(const uint32_t *)src;
   const int32_t r = ((int32_t)(pixel << r_up)) >> 24;
   const int32_t g = ((int32_t)(pixel << g_up)) >> 24;
   const uint32_t b = (pixel >> b_shift) & 0xff;

   dst[0] = (float)(r * (1.0f/0x7f));
   dst[1] = (float)(g * (1.0f/0x7f));
   dst[2] = ubyte_to_float(b);
   dst[3] = 1; /* no alpha channel stored */
}

/**
 * Unpack rows of 32-bit R8S G8S B8U X8U words into 4-byte RGBA pixels.
 *
 * R and G are sign-extended, negative values are raised to 0 with MAX2(),
 * and the result is rescaled from 0..0x7f to 0..0xff.  B is copied
 * unchanged and alpha is written as 255.  Both strides are applied as byte
 * offsets between rows.
 */
static inline void
util_format_r8sg8sb8ux8u_norm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /*
    * r_up / g_up: left shift that moves the signed field to the top byte so
    * that the following arithmetic >> 24 sign-extends it.
    * b_shift: right shift that moves the unsigned B field to the low byte.
    */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_up = 0, g_up = 8, b_shift = 8;
#else
   const unsigned r_up = 24, g_up = 16, b_shift = 16;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t pixel = *(const uint32_t *)in;
         const int32_t r = ((int32_t)(pixel << r_up)) >> 24;
         const int32_t g = ((int32_t)(pixel << g_up)) >> 24;
         const uint32_t b = (pixel >> b_shift) & 0xff;

         out[0] = (uint8_t)(((uint32_t)MAX2(r, 0)) * 0xff / 0x7f);
         out[1] = (uint8_t)(((uint32_t)MAX2(g, 0)) * 0xff / 0x7f);
         out[2] = b;
         out[3] = 255; /* no alpha channel stored */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack rows of 4-byte RGBA pixels into 32-bit R8S G8S B8U X8U words.
 *
 * R and G are halved (>> 1) so that 0..255 lands in 0..127; B is stored
 * unchanged.  The source alpha byte is not read and the X byte is written
 * as zero.  Both strides are applied as byte offsets between rows.
 */
static inline void
util_format_r8sg8sb8ux8u_norm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the native-endian 32-bit word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_shift = 24, g_shift = 16, b_shift = 8;
#else
   const unsigned r_shift = 0, g_shift = 8, b_shift = 16;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t r = ((uint32_t)(in[0] >> 1)) & 0xff;
         const uint32_t g = ((uint32_t)(in[1] >> 1)) & 0xff;
         const uint32_t b = (uint32_t)(in[2] & 0xff);

         *(uint32_t *)out = (r << r_shift) | (g << g_shift) | (b << b_shift);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack rows of 32-bit R10S G10S B10S A2U words into four floats per pixel.
 *
 * R, G and B are sign-extended 10-bit fields (bits 0-9, 10-19, 20-29)
 * scaled by 1/0x1ff; alpha is the unsigned top two bits scaled by 1/0x3.
 * The fields are extracted with the same shifts for either host
 * endianness, so no endian-specific path is needed.  src_stride is applied
 * in bytes; dst_stride is given in bytes and converted to a float count.
 */
static inline void
util_format_r10sg10sb10sa2u_norm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t pixel = *(const uint32_t *)in;
         /* Shift the field to the top, then arithmetic-shift down to sign-extend. */
         const int32_t r = ((int32_t)(pixel << 22)) >> 22;
         const int32_t g = ((int32_t)(pixel << 12)) >> 22;
         const int32_t b = ((int32_t)(pixel << 2)) >> 22;
         const uint32_t a = pixel >> 30;

         out[0] = (float)(r * (1.0f/0x1ff));
         out[1] = (float)(g * (1.0f/0x1ff));
         out[2] = (float)(b * (1.0f/0x1ff));
         out[3] = (float)(a * (1.0f/0x3));
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack rows of four-float RGBA pixels into 32-bit R10S G10S B10S A2U words.
 *
 * R, G and B are clamped to [-1, 1], scaled by 0x1ff, rounded with
 * util_iround() and masked to 10 bits; alpha is clamped to [0, 1], scaled
 * by 0x3 and placed in the top two bits.  The word layout is the same for
 * either host endianness.  dst_stride is applied in bytes; src_stride is
 * given in bytes and converted to a float count.
 */
static inline void
util_format_r10sg10sb10sa2u_norm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t r = ((uint32_t)util_iround(CLAMP(in[0], -1.0f, 1.0f) * 0x1ff)) & 0x3ff;
         const uint32_t g = ((uint32_t)util_iround(CLAMP(in[1], -1.0f, 1.0f) * 0x1ff)) & 0x3ff;
         const uint32_t b = ((uint32_t)util_iround(CLAMP(in[2], -1.0f, 1.0f) * 0x1ff)) & 0x3ff;
         const uint32_t a = (uint32_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0x3);

         *(uint32_t *)out = r | (g << 10) | (b << 20) | (a << 30);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Decode the single 32-bit R10S G10S B10S A2U word at src into dst[0..3].
 *
 * R, G and B are sign-extended 10-bit fields scaled by 1/0x1ff; alpha is
 * the unsigned top two bits scaled by 1/0x3.  The same shifts apply for
 * either host endianness.  The i and j arguments are not used.
 */
static inline void
util_format_r10sg10sb10sa2u_norm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint32_t pixel = *(const uint32_t *)src;
   /* Shift the field to the top, then arithmetic-shift down to sign-extend. */
   const int32_t r = ((int32_t)(pixel << 22)) >> 22;
   const int32_t g = ((int32_t)(pixel << 12)) >> 22;
   const int32_t b = ((int32_t)(pixel << 2)) >> 22;
   const uint32_t a = pixel >> 30;

   dst[0] = (float)(r * (1.0f/0x1ff));
   dst[1] = (float)(g * (1.0f/0x1ff));
   dst[2] = (float)(b * (1.0f/0x1ff));
   dst[3] = (float)(a * (1.0f/0x3));
}

/**
 * Unpack rows of 32-bit R10S G10S B10S A2U words into 4-byte RGBA pixels.
 *
 * R, G and B are sign-extended, negative values are raised to 0 with
 * MAX2(), and the 9-bit magnitude is reduced to 8 bits with >> 1.  Alpha
 * (top two bits) is rescaled from 0..3 to 0..255.  The same shifts apply
 * for either host endianness.  Both strides are applied as byte offsets
 * between rows.
 */
static inline void
util_format_r10sg10sb10sa2u_norm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t pixel = *(const uint32_t *)in;
         /* Shift the field to the top, then arithmetic-shift down to sign-extend. */
         const int32_t r = ((int32_t)(pixel << 22)) >> 22;
         const int32_t g = ((int32_t)(pixel << 12)) >> 22;
         const int32_t b = ((int32_t)(pixel << 2)) >> 22;
         const uint32_t a = pixel >> 30;

         out[0] = (uint8_t)(MAX2(r, 0) >> 1);
         out[1] = (uint8_t)(MAX2(g, 0) >> 1);
         out[2] = (uint8_t)(MAX2(b, 0) >> 1);
         out[3] = (uint8_t)(((uint32_t)a) * 0xff / 0x3);
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack rows of 4-byte RGBA pixels into 32-bit R10S G10S B10S A2U words.
 *
 * R, G and B are rescaled from 0..0xff to 0..0x1ff (integer division) and
 * stored in bits 0-9, 10-19 and 20-29; alpha keeps its top two bits
 * (>> 6) and is stored in bits 30-31.  The word layout is the same for
 * either host endianness.  Both strides are applied as byte offsets
 * between rows.
 */
static inline void
util_format_r10sg10sb10sa2u_norm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t r = (((uint32_t)in[0]) * 0x1ff / 0xff) & 0x3ff;
         const uint32_t g = (((uint32_t)in[1]) * 0x1ff / 0xff) & 0x3ff;
         const uint32_t b = (((uint32_t)in[2]) * 0x1ff / 0xff) & 0x3ff;
         const uint32_t a = (uint32_t)(in[3] >> 6);

         *(uint32_t *)out = r | (g << 10) | (b << 20) | (a << 30);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack rows of 16-bit R5S G5S B6U words into four floats per pixel.
 *
 * R and G are sign-extended 5-bit fields (bits 0-4 and 5-9) scaled by
 * 1/0xf; B is the unsigned top six bits scaled by 1/0x3f; alpha is written
 * as 1.  The same shifts apply for either host endianness.  src_stride is
 * applied in bytes; dst_stride is given in bytes and converted to a float
 * count.
 */
static inline void
util_format_r5sg5sb6u_norm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (col = 0; col < width; ++col, in += 2, out += 4) {
         const uint16_t pixel = *(const uint16_t *)in;
         /* Move the field to the top of 16 bits, then shift down to sign-extend. */
         const int16_t r = ((int16_t)(pixel << 11)) >> 11;
         const int16_t g = ((int16_t)(pixel << 6)) >> 11;
         const uint16_t b = pixel >> 10;

         out[0] = (float)(r * (1.0f/0xf));
         out[1] = (float)(g * (1.0f/0xf));
         out[2] = (float)(b * (1.0f/0x3f));
         out[3] = 1; /* no alpha channel stored */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack rows of four-float RGBA pixels into 16-bit R5S G5S B6U words.
 *
 * R and G are clamped to [-1, 1], scaled by 0xf, rounded with
 * util_iround() and masked to 5 bits; B is clamped to [0, 1], scaled by
 * 0x3f and placed in the top six bits.  The source alpha is not read.  The
 * word layout is the same for either host endianness.  dst_stride is
 * applied in bytes; src_stride is given in bytes and converted to a float
 * count.
 */
static inline void
util_format_r5sg5sb6u_norm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 2) {
         const uint32_t r = ((uint16_t)util_iround(CLAMP(in[0], -1.0f, 1.0f) * 0xf)) & 0x1f;
         const uint32_t g = ((uint16_t)util_iround(CLAMP(in[1], -1.0f, 1.0f) * 0xf)) & 0x1f;
         const uint32_t b = (uint16_t)util_iround(CLAMP(in[2], 0.0f, 1.0f) * 0x3f);

         /* Anything shifted past bit 15 is dropped by the 16-bit store. */
         *(uint16_t *)out = (uint16_t)(r | (g << 5) | (b << 10));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Decode the single 16-bit R5S G5S B6U word at src into dst[0..3].
 *
 * R and G are sign-extended 5-bit fields scaled by 1/0xf; B is the
 * unsigned top six bits scaled by 1/0x3f; alpha is written as 1.  The same
 * shifts apply for either host endianness.  The i and j arguments are not
 * used.
 */
static inline void
util_format_r5sg5sb6u_norm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint16_t pixel = *(const uint16_t *)src;
   /* Move the field to the top of 16 bits, then shift down to sign-extend. */
   const int16_t r = ((int16_t)(pixel << 11)) >> 11;
   const int16_t g = ((int16_t)(pixel << 6)) >> 11;
   const uint16_t b = pixel >> 10;

   dst[0] = (float)(r * (1.0f/0xf));
   dst[1] = (float)(g * (1.0f/0xf));
   dst[2] = (float)(b * (1.0f/0x3f));
   dst[3] = 1; /* no alpha channel stored */
}

/**
 * Unpack rows of 16-bit R5S G5S B6U words into 4-byte RGBA pixels.
 *
 * R and G are sign-extended, negative values are raised to 0 with MAX2(),
 * and the result is rescaled from 0..0xf to 0..0xff.  B is rescaled from
 * 0..0x3f to 0..0xff and alpha is written as 255.  The same shifts apply
 * for either host endianness.  Both strides are applied as byte offsets
 * between rows.
 */
static inline void
util_format_r5sg5sb6u_norm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 2, out += 4) {
         const uint16_t pixel = *(const uint16_t *)in;
         /* Move the field to the top of 16 bits, then shift down to sign-extend. */
         const int16_t r = ((int16_t)(pixel << 11)) >> 11;
         const int16_t g = ((int16_t)(pixel << 6)) >> 11;
         const uint16_t b = pixel >> 10;

         out[0] = (uint8_t)(((uint32_t)MAX2(r, 0)) * 0xff / 0xf);
         out[1] = (uint8_t)(((uint32_t)MAX2(g, 0)) * 0xff / 0xf);
         out[2] = (uint8_t)(((uint32_t)b) * 0xff / 0x3f);
         out[3] = 255; /* no alpha channel stored */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack rows of 4-byte RGBA pixels into 16-bit R5S G5S B6U words.
 *
 * R and G keep their top four bits (>> 4) and are stored in bits 0-4 and
 * 5-9; B keeps its top six bits (>> 2) and is stored in bits 10-15.  The
 * source alpha byte is not read.  The word layout is the same for either
 * host endianness.  Both strides are applied as byte offsets between rows.
 */
static inline void
util_format_r5sg5sb6u_norm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col, in += 4, out += 2) {
         const uint32_t r = ((uint16_t)(in[0] >> 4)) & 0x1f;
         const uint32_t g = ((uint16_t)(in[1] >> 4)) & 0x1f;
         const uint32_t b = (uint16_t)(in[2] >> 2);

         *(uint16_t *)out = (uint16_t)(r | (g << 5) | (b << 10));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack Z24_UNORM_S8_UINT_AS_R8G8B8A8 pixels into RGBA float.
 *
 * Each 4-byte source pixel is read as one 32-bit word and split into four
 * 8-bit channels, each converted with ubyte_to_float().  Which bits hold
 * which channel is chosen by UTIL_ARCH_BIG_ENDIAN.
 *
 * The word is loaded with memcpy() instead of dereferencing a casted
 * uint32_t pointer, so the source does not have to be 4-byte aligned.
 *
 * dst_stride and src_stride are applied as byte steps between rows.
 */
static inline void
util_format_z24_unorm_s8_uint_as_r8g8b8a8_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t r;
         uint32_t g;
         uint32_t b;
         uint32_t a;

         /* Byte-wise copy: src is only guaranteed to be byte aligned. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = value >> 24;
         g = (value >> 16) & 0xff;
         b = (value >> 8) & 0xff;
         a = (value) & 0xff;
#else
         r = (value) & 0xff;
         g = (value >> 8) & 0xff;
         b = (value >> 16) & 0xff;
         a = value >> 24;
#endif
         dst[0] = ubyte_to_float(r); /* r */
         dst[1] = ubyte_to_float(g); /* g */
         dst[2] = ubyte_to_float(b); /* b */
         dst[3] = ubyte_to_float(a); /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA float pixels into Z24_UNORM_S8_UINT_AS_R8G8B8A8 pixels.
 *
 * Each of the four source floats is converted with float_to_ubyte() and the
 * low 8 bits of every result are combined into one 32-bit word; the bit
 * position of each channel is chosen by UTIL_ARCH_BIG_ENDIAN.
 *
 * The word is written with memcpy() instead of through a casted uint32_t
 * pointer, so the destination does not have to be 4-byte aligned.
 *
 * dst_stride and src_stride are applied as byte steps between rows.
 */
static inline void
util_format_z24_unorm_s8_uint_as_r8g8b8a8_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t r = (uint32_t)(float_to_ubyte(src[0])) & 0xff;
         const uint32_t g = (uint32_t)(float_to_ubyte(src[1])) & 0xff;
         const uint32_t b = (uint32_t)(float_to_ubyte(src[2])) & 0xff;
         const uint32_t a = (uint32_t)(float_to_ubyte(src[3])) & 0xff;
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t value = (r << 24) | (g << 16) | (b << 8) | a;
#else
         const uint32_t value = r | (g << 8) | (b << 16) | (a << 24);
#endif
         /* Byte-wise copy: dst is only guaranteed to be byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one Z24_UNORM_S8_UINT_AS_R8G8B8A8 pixel as RGBA float.
 *
 * Reads the 4 bytes at src as one 32-bit word, splits it into four 8-bit
 * channels (positions chosen by UTIL_ARCH_BIG_ENDIAN) and writes each one
 * to dst[0..3] through ubyte_to_float().  The i and j arguments are unused.
 *
 * The word is loaded with memcpy() instead of dereferencing a casted
 * uint32_t pointer, so src does not have to be 4-byte aligned.
 */
static inline void
util_format_z24_unorm_s8_uint_as_r8g8b8a8_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   uint32_t r;
   uint32_t g;
   uint32_t b;
   uint32_t a;

   /* Byte-wise copy: src is only guaranteed to be byte aligned. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   r = value >> 24;
   g = (value >> 16) & 0xff;
   b = (value >> 8) & 0xff;
   a = (value) & 0xff;
#else
   r = (value) & 0xff;
   g = (value >> 8) & 0xff;
   b = (value >> 16) & 0xff;
   a = value >> 24;
#endif
   dst[0] = ubyte_to_float(r); /* r */
   dst[1] = ubyte_to_float(g); /* g */
   dst[2] = ubyte_to_float(b); /* b */
   dst[3] = ubyte_to_float(a); /* a */
}

/**
 * Unpack Z24_UNORM_S8_UINT_AS_R8G8B8A8 pixels into RGBA8 unorm.
 *
 * Each 4-byte source pixel is read as one 32-bit word and split into four
 * 8-bit channels that are stored unchanged as r, g, b, a.  Which bits hold
 * which channel is chosen by UTIL_ARCH_BIG_ENDIAN.
 *
 * The word is loaded with memcpy() instead of dereferencing a casted
 * uint32_t pointer, so the source does not have to be 4-byte aligned.
 *
 * dst_stride and src_stride are applied as byte steps between rows.
 */
static inline void
util_format_z24_unorm_s8_uint_as_r8g8b8a8_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;

         /* Byte-wise copy: src is only guaranteed to be byte aligned. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         dst[0] = (uint8_t)(value >> 24); /* r */
         dst[1] = (uint8_t)((value >> 16) & 0xff); /* g */
         dst[2] = (uint8_t)((value >> 8) & 0xff); /* b */
         dst[3] = (uint8_t)(value & 0xff); /* a */
#else
         dst[0] = (uint8_t)(value & 0xff); /* r */
         dst[1] = (uint8_t)((value >> 8) & 0xff); /* g */
         dst[2] = (uint8_t)((value >> 16) & 0xff); /* b */
         dst[3] = (uint8_t)(value >> 24); /* a */
#endif
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA8 unorm pixels into Z24_UNORM_S8_UINT_AS_R8G8B8A8 pixels.
 *
 * The four source bytes are combined unchanged into one 32-bit word; the
 * bit position of each channel is chosen by UTIL_ARCH_BIG_ENDIAN.
 *
 * The word is written with memcpy() instead of through a casted uint32_t
 * pointer, so the destination does not have to be 4-byte aligned.
 *
 * dst_stride and src_stride are applied as byte steps between rows.
 */
static inline void
util_format_z24_unorm_s8_uint_as_r8g8b8a8_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t r = src[0];
         const uint32_t g = src[1];
         const uint32_t b = src[2];
         const uint32_t a = src[3];
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t value = (r << 24) | (g << 16) | (b << 8) | a;
#else
         const uint32_t value = r | (g << 8) | (b << 16) | (a << 24);
#endif
         /* Byte-wise copy: dst is only guaranteed to be byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/** Memory image of one R64_FLOAT pixel: a single double for red. */
struct util_format_r64_float
{
   double r; /* red */
};

static inline void
util_format_r64_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r64_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r64_float pixel;
         pixel.r = (double)src[0];
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R64_FLOAT pixel as RGBA float.
 *
 * Copies the double at src, narrows it to float for red, and fills green
 * and blue with 0 and alpha with 1.  The i and j arguments are unused.
 */
static inline void
util_format_r64_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r64_float texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)texel.r; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r64_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround(CLAMP(pixel.r, 0.0, 1.0) * 0xff); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r64_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r64_float pixel;
         pixel.r = (double)(src[0] * (1.0f/0xff));
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one R64G64_FLOAT pixel.
 *
 * The member order is the same for both values of UTIL_ARCH_BIG_ENDIAN, so
 * a single definition is used.
 */
struct util_format_r64g64_float
{
   double r; /* red */
   double g; /* green */
};

static inline void
util_format_r64g64_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r64g64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
         src += 16;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r64g64_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64_float pixel;
         pixel.r = (double)src[0];
         pixel.g = (double)src[1];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r64g64_float pixel;
         pixel.r = (double)src[0];
         pixel.g = (double)src[1];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R64G64_FLOAT pixel as RGBA float.
 *
 * Copies the two doubles at src, narrows them to float for red and green,
 * and fills blue with 0 and alpha with 1.  The i and j arguments are unused.
 */
static inline void
util_format_r64g64_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r64g64_float texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)texel.r; /* r */
   dst[1] = (float)texel.g; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r64g64_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround(CLAMP(pixel.r, 0.0, 1.0) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround(CLAMP(pixel.g, 0.0, 1.0) * 0xff); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r64g64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround(CLAMP(pixel.r, 0.0, 1.0) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround(CLAMP(pixel.g, 0.0, 1.0) * 0xff); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#endif
         src += 16;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r64g64_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64_float pixel;
         pixel.r = (double)(src[0] * (1.0f/0xff));
         pixel.g = (double)(src[1] * (1.0f/0xff));
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r64g64_float pixel;
         pixel.r = (double)(src[0] * (1.0f/0xff));
         pixel.g = (double)(src[1] * (1.0f/0xff));
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one R64G64B64_FLOAT pixel.
 *
 * The member order is the same for both values of UTIL_ARCH_BIG_ENDIAN, so
 * a single definition is used.
 */
struct util_format_r64g64b64_float
{
   double r; /* red */
   double g; /* green */
   double b; /* blue */
};

static inline void
util_format_r64g64b64_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r64g64b64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 24;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r64g64b64_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64_float pixel;
         pixel.r = (double)src[0];
         pixel.g = (double)src[1];
         pixel.b = (double)src[2];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r64g64b64_float pixel;
         pixel.r = (double)src[0];
         pixel.g = (double)src[1];
         pixel.b = (double)src[2];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 24;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R64G64B64_FLOAT pixel as RGBA float.
 *
 * Copies the three doubles at src, narrows them to float for red, green and
 * blue, and fills alpha with 1.  The i and j arguments are unused.
 */
static inline void
util_format_r64g64b64_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r64g64b64_float texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)texel.r; /* r */
   dst[1] = (float)texel.g; /* g */
   dst[2] = (float)texel.b; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r64g64b64_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround(CLAMP(pixel.r, 0.0, 1.0) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround(CLAMP(pixel.g, 0.0, 1.0) * 0xff); /* g */
         dst[2] = (uint8_t)util_iround(CLAMP(pixel.b, 0.0, 1.0) * 0xff); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r64g64b64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround(CLAMP(pixel.r, 0.0, 1.0) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround(CLAMP(pixel.g, 0.0, 1.0) * 0xff); /* g */
         dst[2] = (uint8_t)util_iround(CLAMP(pixel.b, 0.0, 1.0) * 0xff); /* b */
         dst[3] = 255; /* a */
#endif
         src += 24;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r64g64b64_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64_float pixel;
         pixel.r = (double)(src[0] * (1.0f/0xff));
         pixel.g = (double)(src[1] * (1.0f/0xff));
         pixel.b = (double)(src[2] * (1.0f/0xff));
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r64g64b64_float pixel;
         pixel.r = (double)(src[0] * (1.0f/0xff));
         pixel.g = (double)(src[1] * (1.0f/0xff));
         pixel.b = (double)(src[2] * (1.0f/0xff));
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 24;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one R64G64B64A64_FLOAT pixel.
 *
 * The member order is the same for both values of UTIL_ARCH_BIG_ENDIAN, so
 * a single definition is used.
 */
struct util_format_r64g64b64a64_float
{
   double r; /* red */
   double g; /* green */
   double b; /* blue */
   double a; /* alpha */
};

static inline void
util_format_r64g64b64a64_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64a64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#else
         struct util_format_r64g64b64a64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#endif
         src += 32;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r64g64b64a64_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64a64_float pixel;
         pixel.r = (double)src[0];
         pixel.g = (double)src[1];
         pixel.b = (double)src[2];
         pixel.a = (double)src[3];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r64g64b64a64_float pixel;
         pixel.r = (double)src[0];
         pixel.g = (double)src[1];
         pixel.b = (double)src[2];
         pixel.a = (double)src[3];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 32;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R64G64B64A64_FLOAT pixel as RGBA float.
 *
 * Copies the four doubles at src and narrows each to float for the matching
 * destination channel.  The i and j arguments are unused.
 */
static inline void
util_format_r64g64b64a64_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r64g64b64a64_float texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)texel.r; /* r */
   dst[1] = (float)texel.g; /* g */
   dst[2] = (float)texel.b; /* b */
   dst[3] = (float)texel.a; /* a */
}

static inline void
util_format_r64g64b64a64_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64a64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround(CLAMP(pixel.r, 0.0, 1.0) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround(CLAMP(pixel.g, 0.0, 1.0) * 0xff); /* g */
         dst[2] = (uint8_t)util_iround(CLAMP(pixel.b, 0.0, 1.0) * 0xff); /* b */
         dst[3] = (uint8_t)util_iround(CLAMP(pixel.a, 0.0, 1.0) * 0xff); /* a */
#else
         struct util_format_r64g64b64a64_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround(CLAMP(pixel.r, 0.0, 1.0) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround(CLAMP(pixel.g, 0.0, 1.0) * 0xff); /* g */
         dst[2] = (uint8_t)util_iround(CLAMP(pixel.b, 0.0, 1.0) * 0xff); /* b */
         dst[3] = (uint8_t)util_iround(CLAMP(pixel.a, 0.0, 1.0) * 0xff); /* a */
#endif
         src += 32;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r64g64b64a64_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r64g64b64a64_float pixel;
         pixel.r = (double)(src[0] * (1.0f/0xff));
         pixel.g = (double)(src[1] * (1.0f/0xff));
         pixel.b = (double)(src[2] * (1.0f/0xff));
         pixel.a = (double)(src[3] * (1.0f/0xff));
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r64g64b64a64_float pixel;
         pixel.r = (double)(src[0] * (1.0f/0xff));
         pixel.g = (double)(src[1] * (1.0f/0xff));
         pixel.b = (double)(src[2] * (1.0f/0xff));
         pixel.a = (double)(src[3] * (1.0f/0xff));
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 32;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/** Memory image of one R32_FLOAT pixel: a single float for red. */
struct util_format_r32_float
{
   float r; /* red */
};

static inline void
util_format_r32_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r32_float pixel;
         pixel.r = src[0];
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R32_FLOAT pixel as RGBA float.
 *
 * Copies the float at src to red and fills green and blue with 0 and alpha
 * with 1.  The i and j arguments are unused.
 */
static inline void
util_format_r32_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r32_float texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = texel.r; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r32_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.r); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r32_float pixel;
         pixel.r = ubyte_to_float(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one R32G32_FLOAT pixel.
 *
 * The member order is the same for both values of UTIL_ARCH_BIG_ENDIAN, so
 * a single definition is used.
 */
struct util_format_r32g32_float
{
   float r; /* red */
   float g; /* green */
};

static inline void
util_format_r32g32_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_float pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_float pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R32G32_FLOAT pixel as RGBA float.
 *
 * Copies the two floats at src to red and green and fills blue with 0 and
 * alpha with 1.  The i and j arguments are unused.
 */
static inline void
util_format_r32g32_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r32g32_float texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = texel.r; /* r */
   dst[1] = texel.g; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r32g32_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.r); /* r */
         dst[1] = float_to_ubyte(pixel.g); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.r); /* r */
         dst[1] = float_to_ubyte(pixel.g); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_float pixel;
         pixel.r = ubyte_to_float(src[0]);
         pixel.g = ubyte_to_float(src[1]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_float pixel;
         pixel.r = ubyte_to_float(src[0]);
         pixel.g = ubyte_to_float(src[1]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one R32G32B32_FLOAT pixel.
 *
 * The member order is the same for both values of UTIL_ARCH_BIG_ENDIAN, so
 * a single definition is used.
 */
struct util_format_r32g32b32_float
{
   float r; /* red */
   float g; /* green */
   float b; /* blue */
};

static inline void
util_format_r32g32b32_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32b32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 12;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_float pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_float pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32_FLOAT pixel as RGBA float.
 *
 * Reads one pixel from src with memcpy and writes four floats to dst; alpha
 * is set to 1.  The i and j arguments are not used.
 */
static inline void
util_format_r32g32b32_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* One code path serves both byte orders. */
   struct util_format_r32g32b32_float texel;
   memcpy(&texel, src, sizeof texel);
   dst[0] = texel.r;
   dst[1] = texel.g;
   dst[2] = texel.b;
   dst[3] = 1; /* format has no alpha channel */
}

static inline void
util_format_r32g32b32_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.r); /* r */
         dst[1] = float_to_ubyte(pixel.g); /* g */
         dst[2] = float_to_ubyte(pixel.b); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32b32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.r); /* r */
         dst[1] = float_to_ubyte(pixel.g); /* g */
         dst[2] = float_to_ubyte(pixel.b); /* b */
         dst[3] = 255; /* a */
#endif
         src += 12;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_float pixel;
         pixel.r = ubyte_to_float(src[0]);
         pixel.g = ubyte_to_float(src[1]);
         pixel.b = ubyte_to_float(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_float pixel;
         pixel.r = ubyte_to_float(src[0]);
         pixel.g = ubyte_to_float(src[1]);
         pixel.b = ubyte_to_float(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one R32G32B32A32_FLOAT pixel.
 *
 * The member order is the same regardless of UTIL_ARCH_BIG_ENDIAN, so a
 * single declaration is used.
 */
struct util_format_r32g32b32a32_float {
   float r; /* red */
   float g; /* green */
   float b; /* blue */
   float a; /* alpha */
};

static inline void
util_format_r32g32b32a32_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = pixel.a; /* a */
#else
         struct util_format_r32g32b32a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = pixel.a; /* a */
#endif
         src += 16;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32a32_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_float pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_float pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32A32_FLOAT pixel as RGBA float.
 *
 * Reads one pixel from src with memcpy and copies its four channels to dst.
 * The i and j arguments are not used.
 */
static inline void
util_format_r32g32b32a32_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* One code path serves both byte orders. */
   struct util_format_r32g32b32a32_float texel;
   memcpy(&texel, src, sizeof texel);
   dst[0] = texel.r;
   dst[1] = texel.g;
   dst[2] = texel.b;
   dst[3] = texel.a;
}

static inline void
util_format_r32g32b32a32_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.r); /* r */
         dst[1] = float_to_ubyte(pixel.g); /* g */
         dst[2] = float_to_ubyte(pixel.b); /* b */
         dst[3] = float_to_ubyte(pixel.a); /* a */
#else
         struct util_format_r32g32b32a32_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(pixel.r); /* r */
         dst[1] = float_to_ubyte(pixel.g); /* g */
         dst[2] = float_to_ubyte(pixel.b); /* b */
         dst[3] = float_to_ubyte(pixel.a); /* a */
#endif
         src += 16;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32a32_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_float pixel;
         pixel.r = ubyte_to_float(src[0]);
         pixel.g = ubyte_to_float(src[1]);
         pixel.b = ubyte_to_float(src[2]);
         pixel.a = ubyte_to_float(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_float pixel;
         pixel.r = ubyte_to_float(src[0]);
         pixel.g = ubyte_to_float(src[1]);
         pixel.b = ubyte_to_float(src[2]);
         pixel.a = ubyte_to_float(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack R32_UNORM pixels into RGBA float.
 *
 * Each 4-byte source pixel is scaled by 1/0xffffffff into red; green and
 * blue are set to 0 and alpha to 1.  Rows advance by src_stride bytes on
 * the source and by dst_stride / sizeof(float) floats on the destination.
 *
 * The source word is loaded with memcpy rather than through a casted
 * uint32_t pointer, so src_row does not need to be 4-byte aligned.  This
 * matches the memcpy-based multi-channel 32-bit unpackers in this file.
 */
static inline void
util_format_r32_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t r;
         memcpy(&r, src, sizeof r); /* alignment-safe load */
         dst[0] = (float)(r * (1.0/0xffffffff)); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA float pixels into R32_UNORM.
 *
 * The red float of each 4-float source pixel is passed through
 * CLAMP(.., 0.0f, 1.0f), multiplied by 0xffffffff in double precision and
 * truncated to uint32_t; green, blue and alpha are not read.  Rows advance
 * by dst_stride bytes on the destination and by src_stride / sizeof(float)
 * floats on the source.
 *
 * The result is stored with memcpy rather than through a casted uint32_t
 * pointer, so dst_row does not need to be 4-byte aligned.  This matches the
 * memcpy-based multi-channel 32-bit packers in this file.
 */
static inline void
util_format_r32_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = (uint32_t)(CLAMP(src[0], 0.0f, 1.0f) * (double)0xffffffff);
         memcpy(dst, &value, sizeof value); /* alignment-safe store */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32_UNORM pixel as RGBA float.
 *
 * Red is the 32-bit source word scaled by 1/0xffffffff; green and blue are
 * 0 and alpha is 1.  The i and j arguments are not used.
 *
 * The source word is loaded with memcpy rather than through a casted
 * uint32_t pointer, so src does not need to be 4-byte aligned.
 */
static inline void
util_format_r32_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t r;
   memcpy(&r, src, sizeof r); /* alignment-safe load */
   dst[0] = (float)(r * (1.0/0xffffffff)); /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack R32_UNORM pixels into RGBA8 unorm.
 *
 * Red is the top 8 bits of each 32-bit source word (value >> 24); green and
 * blue are written as 0 and alpha as 255.  Source and destination both
 * advance 4 bytes per pixel.  Rows advance by src_stride and dst_stride.
 *
 * The source word is loaded with memcpy rather than through a casted
 * uint32_t pointer, so src_row does not need to be 4-byte aligned.
 */
static inline void
util_format_r32_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t r;
         memcpy(&r, src, sizeof r); /* alignment-safe load */
         dst[0] = (uint8_t)(r >> 24); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA8 unorm pixels into R32_UNORM.
 *
 * The red byte of each 4-byte source pixel is expanded to 32 bits as
 * red * 0xffffffff / 0xff (computed in 64 bits); green, blue and alpha are
 * not read.  Source and destination both advance 4 bytes per pixel.  Rows
 * advance by src_stride and dst_stride.
 *
 * The result is stored with memcpy rather than through a casted uint32_t
 * pointer, so dst_row does not need to be 4-byte aligned.
 */
static inline void
util_format_r32_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = (uint32_t)(((uint64_t)src[0]) * 0xffffffff / 0xff);
         memcpy(dst, &value, sizeof value); /* alignment-safe store */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one R32G32_UNORM pixel.
 *
 * The member order is the same regardless of UTIL_ARCH_BIG_ENDIAN, so a
 * single declaration is used.
 */
struct util_format_r32g32_unorm {
   uint32_t r; /* red */
   uint32_t g; /* green */
};

static inline void
util_format_r32g32_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0xffffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0xffffffff)); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0xffffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0xffffffff)); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_unorm pixel;
         pixel.r = (uint32_t)(CLAMP(src[0], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.g = (uint32_t)(CLAMP(src[1], 0.0f, 1.0f) * (double)0xffffffff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_unorm pixel;
         pixel.r = (uint32_t)(CLAMP(src[0], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.g = (uint32_t)(CLAMP(src[1], 0.0f, 1.0f) * (double)0xffffffff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32_UNORM pixel as RGBA float.
 *
 * Reads one pixel from src with memcpy; red and green are scaled by
 * 1/0xffffffff, blue is 0 and alpha is 1.  The i and j arguments are not
 * used.
 */
static inline void
util_format_r32g32_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* One code path serves both byte orders. */
   struct util_format_r32g32_unorm texel;
   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)(texel.r * (1.0/0xffffffff));
   dst[1] = (float)(texel.g * (1.0/0xffffffff));
   dst[2] = 0; /* no blue channel */
   dst[3] = 1; /* no alpha channel */
}

static inline void
util_format_r32g32_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(pixel.r >> 24); /* r */
         dst[1] = (uint8_t)(pixel.g >> 24); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(pixel.r >> 24); /* r */
         dst[1] = (uint8_t)(pixel.g >> 24); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_unorm pixel;
         pixel.r = (uint32_t)(((uint64_t)src[0]) * 0xffffffff / 0xff);
         pixel.g = (uint32_t)(((uint64_t)src[1]) * 0xffffffff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_unorm pixel;
         pixel.r = (uint32_t)(((uint64_t)src[0]) * 0xffffffff / 0xff);
         pixel.g = (uint32_t)(((uint64_t)src[1]) * 0xffffffff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one R32G32B32_UNORM pixel.
 *
 * The member order is the same regardless of UTIL_ARCH_BIG_ENDIAN, so a
 * single declaration is used.
 */
struct util_format_r32g32b32_unorm {
   uint32_t r; /* red */
   uint32_t g; /* green */
   uint32_t b; /* blue */
};

static inline void
util_format_r32g32b32_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0xffffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0xffffffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0xffffffff)); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32b32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0xffffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0xffffffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0xffffffff)); /* b */
         dst[3] = 1; /* a */
#endif
         src += 12;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_unorm pixel;
         pixel.r = (uint32_t)(CLAMP(src[0], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.g = (uint32_t)(CLAMP(src[1], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.b = (uint32_t)(CLAMP(src[2], 0.0f, 1.0f) * (double)0xffffffff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_unorm pixel;
         pixel.r = (uint32_t)(CLAMP(src[0], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.g = (uint32_t)(CLAMP(src[1], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.b = (uint32_t)(CLAMP(src[2], 0.0f, 1.0f) * (double)0xffffffff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32_UNORM pixel as RGBA float.
 *
 * Reads one pixel from src with memcpy; red, green and blue are scaled by
 * 1/0xffffffff and alpha is 1.  The i and j arguments are not used.
 */
static inline void
util_format_r32g32b32_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* One code path serves both byte orders. */
   struct util_format_r32g32b32_unorm texel;
   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)(texel.r * (1.0/0xffffffff));
   dst[1] = (float)(texel.g * (1.0/0xffffffff));
   dst[2] = (float)(texel.b * (1.0/0xffffffff));
   dst[3] = 1; /* no alpha channel */
}

static inline void
util_format_r32g32b32_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(pixel.r >> 24); /* r */
         dst[1] = (uint8_t)(pixel.g >> 24); /* g */
         dst[2] = (uint8_t)(pixel.b >> 24); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32b32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(pixel.r >> 24); /* r */
         dst[1] = (uint8_t)(pixel.g >> 24); /* g */
         dst[2] = (uint8_t)(pixel.b >> 24); /* b */
         dst[3] = 255; /* a */
#endif
         src += 12;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_unorm pixel;
         pixel.r = (uint32_t)(((uint64_t)src[0]) * 0xffffffff / 0xff);
         pixel.g = (uint32_t)(((uint64_t)src[1]) * 0xffffffff / 0xff);
         pixel.b = (uint32_t)(((uint64_t)src[2]) * 0xffffffff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_unorm pixel;
         pixel.r = (uint32_t)(((uint64_t)src[0]) * 0xffffffff / 0xff);
         pixel.g = (uint32_t)(((uint64_t)src[1]) * 0xffffffff / 0xff);
         pixel.b = (uint32_t)(((uint64_t)src[2]) * 0xffffffff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one R32G32B32A32_UNORM pixel.
 *
 * The member order is the same regardless of UTIL_ARCH_BIG_ENDIAN, so a
 * single declaration is used.
 */
struct util_format_r32g32b32a32_unorm {
   uint32_t r; /* red */
   uint32_t g; /* green */
   uint32_t b; /* blue */
   uint32_t a; /* alpha */
};

static inline void
util_format_r32g32b32a32_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0xffffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0xffffffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0xffffffff)); /* b */
         dst[3] = (float)(pixel.a * (1.0/0xffffffff)); /* a */
#else
         struct util_format_r32g32b32a32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0xffffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0xffffffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0xffffffff)); /* b */
         dst[3] = (float)(pixel.a * (1.0/0xffffffff)); /* a */
#endif
         src += 16;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32a32_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_unorm pixel;
         pixel.r = (uint32_t)(CLAMP(src[0], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.g = (uint32_t)(CLAMP(src[1], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.b = (uint32_t)(CLAMP(src[2], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.a = (uint32_t)(CLAMP(src[3], 0.0f, 1.0f) * (double)0xffffffff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_unorm pixel;
         pixel.r = (uint32_t)(CLAMP(src[0], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.g = (uint32_t)(CLAMP(src[1], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.b = (uint32_t)(CLAMP(src[2], 0.0f, 1.0f) * (double)0xffffffff);
         pixel.a = (uint32_t)(CLAMP(src[3], 0.0f, 1.0f) * (double)0xffffffff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32A32_UNORM pixel as RGBA float.
 *
 * Reads one pixel from src with memcpy and scales every channel by
 * 1/0xffffffff.  The i and j arguments are not used.
 */
static inline void
util_format_r32g32b32a32_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* One code path serves both byte orders. */
   struct util_format_r32g32b32a32_unorm texel;
   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)(texel.r * (1.0/0xffffffff));
   dst[1] = (float)(texel.g * (1.0/0xffffffff));
   dst[2] = (float)(texel.b * (1.0/0xffffffff));
   dst[3] = (float)(texel.a * (1.0/0xffffffff));
}

static inline void
util_format_r32g32b32a32_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(pixel.r >> 24); /* r */
         dst[1] = (uint8_t)(pixel.g >> 24); /* g */
         dst[2] = (uint8_t)(pixel.b >> 24); /* b */
         dst[3] = (uint8_t)(pixel.a >> 24); /* a */
#else
         struct util_format_r32g32b32a32_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(pixel.r >> 24); /* r */
         dst[1] = (uint8_t)(pixel.g >> 24); /* g */
         dst[2] = (uint8_t)(pixel.b >> 24); /* b */
         dst[3] = (uint8_t)(pixel.a >> 24); /* a */
#endif
         src += 16;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32a32_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_unorm pixel;
         pixel.r = (uint32_t)(((uint64_t)src[0]) * 0xffffffff / 0xff);
         pixel.g = (uint32_t)(((uint64_t)src[1]) * 0xffffffff / 0xff);
         pixel.b = (uint32_t)(((uint64_t)src[2]) * 0xffffffff / 0xff);
         pixel.a = (uint32_t)(((uint64_t)src[3]) * 0xffffffff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_unorm pixel;
         pixel.r = (uint32_t)(((uint64_t)src[0]) * 0xffffffff / 0xff);
         pixel.g = (uint32_t)(((uint64_t)src[1]) * 0xffffffff / 0xff);
         pixel.b = (uint32_t)(((uint64_t)src[2]) * 0xffffffff / 0xff);
         pixel.a = (uint32_t)(((uint64_t)src[3]) * 0xffffffff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack R32_USCALED pixels into RGBA float.
 *
 * Each 4-byte source pixel is converted to float by value (no
 * normalization) into red; green and blue are set to 0 and alpha to 1.
 * Rows advance by src_stride bytes on the source and by
 * dst_stride / sizeof(float) floats on the destination.
 *
 * The source word is loaded with memcpy rather than through a casted
 * uint32_t pointer, so src_row does not need to be 4-byte aligned.  This
 * matches the memcpy-based multi-channel 32-bit unpackers in this file.
 */
static inline void
util_format_r32_uscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t r;
         memcpy(&r, src, sizeof r); /* alignment-safe load */
         dst[0] = (float)r; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA float pixels into R32_USCALED.
 *
 * The red float of each 4-float source pixel is passed through
 * CLAMP(.., 0.0f, 4294967040.0f) and truncated to uint32_t; green, blue and
 * alpha are not read.  4294967040.0f is the largest float below 2^32, which
 * keeps the float-to-uint32_t conversion in range.  Rows advance by
 * dst_stride bytes on the destination and by src_stride / sizeof(float)
 * floats on the source.
 *
 * The result is stored with memcpy rather than through a casted uint32_t
 * pointer, so dst_row does not need to be 4-byte aligned.  This matches the
 * memcpy-based multi-channel 32-bit packers in this file.
 */
static inline void
util_format_r32_uscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = (uint32_t)CLAMP(src[0], 0.0f, 4294967040.0f);
         memcpy(dst, &value, sizeof value); /* alignment-safe store */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32_USCALED pixel as RGBA float.
 *
 * Red is the 32-bit source word converted to float by value; green and
 * blue are 0 and alpha is 1.  The i and j arguments are not used.
 *
 * The source word is loaded with memcpy rather than through a casted
 * uint32_t pointer, so src does not need to be 4-byte aligned.
 */
static inline void
util_format_r32_uscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t r;
   memcpy(&r, src, sizeof r); /* alignment-safe load */
   dst[0] = (float)r; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack R32_USCALED pixels into RGBA8 unorm.
 *
 * Red is computed as MIN2(value, 1) * 0xff, so (assuming MIN2 yields the
 * smaller operand) a zero source word gives 0 and any other value gives
 * 255.  Green and blue are written as 0 and alpha as 255.  Source and
 * destination both advance 4 bytes per pixel.  Rows advance by src_stride
 * and dst_stride.
 *
 * The source word is loaded with memcpy rather than through a casted
 * uint32_t pointer, so src_row does not need to be 4-byte aligned.
 */
static inline void
util_format_r32_uscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t r;
         memcpy(&r, src, sizeof r); /* alignment-safe load */
         dst[0] = (uint8_t)(((uint64_t)MIN2(r, 1)) * 0xff / 0x1); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA8 unorm pixels into R32_USCALED.
 *
 * The red byte of each 4-byte source pixel is reduced with the integer
 * division red * 1 / 0xff, so 255 becomes 1 and every smaller value
 * becomes 0; green, blue and alpha are not read.  Source and destination
 * both advance 4 bytes per pixel.  Rows advance by src_stride and
 * dst_stride.
 *
 * The result is stored with memcpy rather than through a casted uint32_t
 * pointer, so dst_row does not need to be 4-byte aligned.
 */
static inline void
util_format_r32_uscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = (uint32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         memcpy(dst, &value, sizeof value); /* alignment-safe store */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one R32G32_USCALED pixel.
 *
 * The member order is the same regardless of UTIL_ARCH_BIG_ENDIAN, so a
 * single declaration is used.
 */
struct util_format_r32g32_uscaled {
   uint32_t r; /* red */
   uint32_t g; /* green */
};

static inline void
util_format_r32g32_uscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32_uscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_uscaled pixel;
         pixel.r = (uint32_t)CLAMP(src[0], 0.0f, 4294967040.0f);
         pixel.g = (uint32_t)CLAMP(src[1], 0.0f, 4294967040.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_uscaled pixel;
         pixel.r = (uint32_t)CLAMP(src[0], 0.0f, 4294967040.0f);
         pixel.g = (uint32_t)CLAMP(src[1], 0.0f, 4294967040.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32 unsigned pixel as RGBA float.
 *
 * Copies sizeof(struct util_format_r32g32_uscaled) bytes from src, casts
 * r and g to float, and writes b = 0 and a = 1.  The i and j arguments
 * are not used.
 */
static inline void
util_format_r32g32_uscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The pixel layout does not depend on byte order, so one path is enough. */
   struct util_format_r32g32_uscaled texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)texel.r; /* r */
   dst[1] = (float)texel.g; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r32g32_uscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32_uscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_uscaled pixel;
         pixel.r = (uint32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_uscaled pixel;
         pixel.r = (uint32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * In-memory layout of one R32G32B32 pixel with unsigned 32-bit channels.
 * The field order is the same whether or not UTIL_ARCH_BIG_ENDIAN is set,
 * so a single definition covers both cases.
 */
struct util_format_r32g32b32_uscaled {
   uint32_t r;
   uint32_t g;
   uint32_t b;
};

static inline void
util_format_r32g32b32_uscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32b32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 12;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32_uscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_uscaled pixel;
         pixel.r = (uint32_t)CLAMP(src[0], 0.0f, 4294967040.0f);
         pixel.g = (uint32_t)CLAMP(src[1], 0.0f, 4294967040.0f);
         pixel.b = (uint32_t)CLAMP(src[2], 0.0f, 4294967040.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_uscaled pixel;
         pixel.r = (uint32_t)CLAMP(src[0], 0.0f, 4294967040.0f);
         pixel.g = (uint32_t)CLAMP(src[1], 0.0f, 4294967040.0f);
         pixel.b = (uint32_t)CLAMP(src[2], 0.0f, 4294967040.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32 unsigned pixel as RGBA float.
 *
 * Copies sizeof(struct util_format_r32g32b32_uscaled) bytes from src,
 * casts r, g and b to float, and writes a = 1.  The i and j arguments
 * are not used.
 */
static inline void
util_format_r32g32b32_uscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The pixel layout does not depend on byte order, so one path is enough. */
   struct util_format_r32g32b32_uscaled texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)texel.r; /* r */
   dst[1] = (float)texel.g; /* g */
   dst[2] = (float)texel.b; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r32g32b32_uscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint64_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32b32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint64_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#endif
         src += 12;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32_uscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_uscaled pixel;
         pixel.r = (uint32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint32_t)(((uint64_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_uscaled pixel;
         pixel.r = (uint32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint32_t)(((uint64_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * In-memory layout of one R32G32B32A32 pixel with unsigned 32-bit channels.
 * The field order is the same whether or not UTIL_ARCH_BIG_ENDIAN is set,
 * so a single definition covers both cases.
 */
struct util_format_r32g32b32a32_uscaled {
   uint32_t r;
   uint32_t g;
   uint32_t b;
   uint32_t a;
};

static inline void
util_format_r32g32b32a32_uscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#else
         struct util_format_r32g32b32a32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#endif
         src += 16;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32a32_uscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_uscaled pixel;
         pixel.r = (uint32_t)CLAMP(src[0], 0.0f, 4294967040.0f);
         pixel.g = (uint32_t)CLAMP(src[1], 0.0f, 4294967040.0f);
         pixel.b = (uint32_t)CLAMP(src[2], 0.0f, 4294967040.0f);
         pixel.a = (uint32_t)CLAMP(src[3], 0.0f, 4294967040.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_uscaled pixel;
         pixel.r = (uint32_t)CLAMP(src[0], 0.0f, 4294967040.0f);
         pixel.g = (uint32_t)CLAMP(src[1], 0.0f, 4294967040.0f);
         pixel.b = (uint32_t)CLAMP(src[2], 0.0f, 4294967040.0f);
         pixel.a = (uint32_t)CLAMP(src[3], 0.0f, 4294967040.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32A32 unsigned pixel as RGBA float.
 *
 * Copies sizeof(struct util_format_r32g32b32a32_uscaled) bytes from src
 * and casts all four channels to float.  The i and j arguments are not
 * used.
 */
static inline void
util_format_r32g32b32a32_uscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The pixel layout does not depend on byte order, so one path is enough. */
   struct util_format_r32g32b32a32_uscaled texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)texel.r; /* r */
   dst[1] = (float)texel.g; /* g */
   dst[2] = (float)texel.b; /* b */
   dst[3] = (float)texel.a; /* a */
}

static inline void
util_format_r32g32b32a32_uscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint64_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = (uint8_t)(((uint64_t)MIN2(pixel.a, 1)) * 0xff / 0x1); /* a */
#else
         struct util_format_r32g32b32a32_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint64_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = (uint8_t)(((uint64_t)MIN2(pixel.a, 1)) * 0xff / 0x1); /* a */
#endif
         src += 16;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32a32_uscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_uscaled pixel;
         pixel.r = (uint32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint32_t)(((uint64_t)src[2]) * 0x1 / 0xff);
         pixel.a = (uint32_t)(((uint64_t)src[3]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_uscaled pixel;
         pixel.r = (uint32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint32_t)(((uint64_t)src[2]) * 0x1 / 0xff);
         pixel.a = (uint32_t)(((uint64_t)src[3]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack rows of 32-bit signed normalized single-channel pixels into
 * RGBA floats.
 *
 * Each 4-byte source value is reinterpreted as int32_t and multiplied by
 * 1.0/0x7fffffff; g and b are written as 0 and a as 1.  src_stride is
 * added to the byte pointer per row; dst_stride is divided by
 * sizeof(float) before it is added to the float pointer.
 *
 * The source is read with memcpy so that it does not need to be 4-byte
 * aligned.
 */
static inline void
util_format_r32_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         int32_t r;
         /* src is a byte pointer with no alignment guarantee; dereferencing
          * it as uint32_t would be undefined for misaligned rows. */
         memcpy(&value, src, sizeof value);
         r = (int32_t)(value) ;
         dst[0] = (float)(r * (1.0/0x7fffffff)); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA floats into 32-bit signed normalized single-channel
 * pixels.
 *
 * Only the first float of each 4-float source pixel is read.  It is
 * passed through CLAMP(..., -1.0f, 1.0f), multiplied by 0x7fffffff as a
 * double, and converted to int32_t.  dst_stride is added to the byte
 * pointer per row; src_stride is divided by sizeof(float) before it is
 * added to the float pointer.
 *
 * The destination is written with memcpy so that it does not need to be
 * 4-byte aligned.
 */
static inline void
util_format_r32_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
         value |= (uint32_t)((int32_t)(CLAMP(src[0], -1.0f, 1.0f) * (double)0x7fffffff)) ;
         /* dst is a byte pointer with no alignment guarantee; dereferencing
          * it as uint32_t would be undefined for misaligned rows. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single 32-bit signed normalized single-channel pixel as RGBA
 * float.
 *
 * The 4 bytes at src are reinterpreted as int32_t and multiplied by
 * 1.0/0x7fffffff; g and b are written as 0 and a as 1.  The i and j
 * arguments are not used.
 *
 * The source is read with memcpy so that it does not need to be 4-byte
 * aligned.
 */
static inline void
util_format_r32_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   int32_t r;
   /* src is a byte pointer with no alignment guarantee; dereferencing it
    * as uint32_t would be undefined for misaligned addresses. */
   memcpy(&value, src, sizeof value);
   r = (int32_t)(value) ;
   dst[0] = (float)(r * (1.0/0x7fffffff)); /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack rows of 32-bit signed normalized single-channel pixels into
 * RGBA8.
 *
 * Each 4-byte source value is reinterpreted as int32_t, passed through
 * MAX2(r, 0) (so, assuming MAX2 yields the larger argument, negative
 * values become 0) and shifted right by 23 to leave the top 8 magnitude
 * bits.  g and b are written as 0 and a as 255.
 *
 * The source is read with memcpy so that it does not need to be 4-byte
 * aligned.
 */
static inline void
util_format_r32_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         int32_t r;
         /* src is a byte pointer with no alignment guarantee; dereferencing
          * it as uint32_t would be undefined for misaligned rows. */
         memcpy(&value, src, sizeof value);
         r = (int32_t)(value) ;
         dst[0] = (uint8_t)(MAX2(r, 0) >> 23); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA8 pixels into 32-bit signed normalized single-channel
 * pixels.
 *
 * Only the first (red) byte of each 4-byte source pixel is read.  It is
 * scaled with byte * 0x7fffffff / 0xff in 64-bit integer arithmetic, so
 * 0 maps to 0 and 0xff maps to 0x7fffffff.
 *
 * The destination is written with memcpy so that it does not need to be
 * 4-byte aligned.
 */
static inline void
util_format_r32_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
         value |= (uint32_t)((int32_t)(((uint64_t)src[0]) * 0x7fffffff / 0xff)) ;
         /* dst is a byte pointer with no alignment guarantee; dereferencing
          * it as uint32_t would be undefined for misaligned rows. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * In-memory layout of one R32G32 pixel with signed 32-bit channels.
 * The field order is the same whether or not UTIL_ARCH_BIG_ENDIAN is set,
 * so a single definition covers both cases.
 */
struct util_format_r32g32_snorm {
   int32_t r;
   int32_t g;
};

static inline void
util_format_r32g32_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x7fffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x7fffffff)); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x7fffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x7fffffff)); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_snorm pixel;
         pixel.r = (int32_t)(CLAMP(src[0], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.g = (int32_t)(CLAMP(src[1], -1.0f, 1.0f) * (double)0x7fffffff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_snorm pixel;
         pixel.r = (int32_t)(CLAMP(src[0], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.g = (int32_t)(CLAMP(src[1], -1.0f, 1.0f) * (double)0x7fffffff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32 signed normalized pixel as RGBA float.
 *
 * Copies sizeof(struct util_format_r32g32_snorm) bytes from src,
 * multiplies r and g by 1.0/0x7fffffff, and writes b = 0 and a = 1.
 * The i and j arguments are not used.
 */
static inline void
util_format_r32g32_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The pixel layout does not depend on byte order, so one path is enough. */
   struct util_format_r32g32_snorm texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)(texel.r * (1.0/0x7fffffff)); /* r */
   dst[1] = (float)(texel.g * (1.0/0x7fffffff)); /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r32g32_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(MAX2(pixel.r, 0) >> 23); /* r */
         dst[1] = (uint8_t)(MAX2(pixel.g, 0) >> 23); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(MAX2(pixel.r, 0) >> 23); /* r */
         dst[1] = (uint8_t)(MAX2(pixel.g, 0) >> 23); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_snorm pixel;
         pixel.r = (int32_t)(((uint64_t)src[0]) * 0x7fffffff / 0xff);
         pixel.g = (int32_t)(((uint64_t)src[1]) * 0x7fffffff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_snorm pixel;
         pixel.r = (int32_t)(((uint64_t)src[0]) * 0x7fffffff / 0xff);
         pixel.g = (int32_t)(((uint64_t)src[1]) * 0x7fffffff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * In-memory layout of one R32G32B32 pixel with signed 32-bit channels.
 * The field order is the same whether or not UTIL_ARCH_BIG_ENDIAN is set,
 * so a single definition covers both cases.
 */
struct util_format_r32g32b32_snorm {
   int32_t r;
   int32_t g;
   int32_t b;
};

static inline void
util_format_r32g32b32_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x7fffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x7fffffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0x7fffffff)); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32b32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x7fffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x7fffffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0x7fffffff)); /* b */
         dst[3] = 1; /* a */
#endif
         src += 12;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_snorm pixel;
         pixel.r = (int32_t)(CLAMP(src[0], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.g = (int32_t)(CLAMP(src[1], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.b = (int32_t)(CLAMP(src[2], -1.0f, 1.0f) * (double)0x7fffffff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_snorm pixel;
         pixel.r = (int32_t)(CLAMP(src[0], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.g = (int32_t)(CLAMP(src[1], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.b = (int32_t)(CLAMP(src[2], -1.0f, 1.0f) * (double)0x7fffffff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32 signed normalized pixel as RGBA float.
 *
 * Copies sizeof(struct util_format_r32g32b32_snorm) bytes from src,
 * multiplies r, g and b by 1.0/0x7fffffff, and writes a = 1.  The i and
 * j arguments are not used.
 */
static inline void
util_format_r32g32b32_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The pixel layout does not depend on byte order, so one path is enough. */
   struct util_format_r32g32b32_snorm texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)(texel.r * (1.0/0x7fffffff)); /* r */
   dst[1] = (float)(texel.g * (1.0/0x7fffffff)); /* g */
   dst[2] = (float)(texel.b * (1.0/0x7fffffff)); /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r32g32b32_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(MAX2(pixel.r, 0) >> 23); /* r */
         dst[1] = (uint8_t)(MAX2(pixel.g, 0) >> 23); /* g */
         dst[2] = (uint8_t)(MAX2(pixel.b, 0) >> 23); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32b32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(MAX2(pixel.r, 0) >> 23); /* r */
         dst[1] = (uint8_t)(MAX2(pixel.g, 0) >> 23); /* g */
         dst[2] = (uint8_t)(MAX2(pixel.b, 0) >> 23); /* b */
         dst[3] = 255; /* a */
#endif
         src += 12;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_snorm pixel;
         pixel.r = (int32_t)(((uint64_t)src[0]) * 0x7fffffff / 0xff);
         pixel.g = (int32_t)(((uint64_t)src[1]) * 0x7fffffff / 0xff);
         pixel.b = (int32_t)(((uint64_t)src[2]) * 0x7fffffff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_snorm pixel;
         pixel.r = (int32_t)(((uint64_t)src[0]) * 0x7fffffff / 0xff);
         pixel.g = (int32_t)(((uint64_t)src[1]) * 0x7fffffff / 0xff);
         pixel.b = (int32_t)(((uint64_t)src[2]) * 0x7fffffff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/*
 * In-memory layout of one R32G32B32A32 pixel with signed 32-bit channels.
 * The field order is the same whether or not UTIL_ARCH_BIG_ENDIAN is set,
 * so a single definition covers both cases.
 */
struct util_format_r32g32b32a32_snorm {
   int32_t r;
   int32_t g;
   int32_t b;
   int32_t a;
};

static inline void
util_format_r32g32b32a32_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x7fffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x7fffffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0x7fffffff)); /* b */
         dst[3] = (float)(pixel.a * (1.0/0x7fffffff)); /* a */
#else
         struct util_format_r32g32b32a32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x7fffffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x7fffffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0x7fffffff)); /* b */
         dst[3] = (float)(pixel.a * (1.0/0x7fffffff)); /* a */
#endif
         src += 16;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32a32_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_snorm pixel;
         pixel.r = (int32_t)(CLAMP(src[0], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.g = (int32_t)(CLAMP(src[1], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.b = (int32_t)(CLAMP(src[2], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.a = (int32_t)(CLAMP(src[3], -1.0f, 1.0f) * (double)0x7fffffff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_snorm pixel;
         pixel.r = (int32_t)(CLAMP(src[0], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.g = (int32_t)(CLAMP(src[1], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.b = (int32_t)(CLAMP(src[2], -1.0f, 1.0f) * (double)0x7fffffff);
         pixel.a = (int32_t)(CLAMP(src[3], -1.0f, 1.0f) * (double)0x7fffffff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R32G32B32A32 signed normalized pixel as RGBA float.
 *
 * Copies sizeof(struct util_format_r32g32b32a32_snorm) bytes from src and
 * multiplies all four channels by 1.0/0x7fffffff.  The i and j arguments
 * are not used.
 */
static inline void
util_format_r32g32b32a32_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The pixel layout does not depend on byte order, so one path is enough. */
   struct util_format_r32g32b32a32_snorm texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)(texel.r * (1.0/0x7fffffff)); /* r */
   dst[1] = (float)(texel.g * (1.0/0x7fffffff)); /* g */
   dst[2] = (float)(texel.b * (1.0/0x7fffffff)); /* b */
   dst[3] = (float)(texel.a * (1.0/0x7fffffff)); /* a */
}

static inline void
util_format_r32g32b32a32_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(MAX2(pixel.r, 0) >> 23); /* r */
         dst[1] = (uint8_t)(MAX2(pixel.g, 0) >> 23); /* g */
         dst[2] = (uint8_t)(MAX2(pixel.b, 0) >> 23); /* b */
         dst[3] = (uint8_t)(MAX2(pixel.a, 0) >> 23); /* a */
#else
         struct util_format_r32g32b32a32_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(MAX2(pixel.r, 0) >> 23); /* r */
         dst[1] = (uint8_t)(MAX2(pixel.g, 0) >> 23); /* g */
         dst[2] = (uint8_t)(MAX2(pixel.b, 0) >> 23); /* b */
         dst[3] = (uint8_t)(MAX2(pixel.a, 0) >> 23); /* a */
#endif
         src += 16;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32a32_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_snorm pixel;
         pixel.r = (int32_t)(((uint64_t)src[0]) * 0x7fffffff / 0xff);
         pixel.g = (int32_t)(((uint64_t)src[1]) * 0x7fffffff / 0xff);
         pixel.b = (int32_t)(((uint64_t)src[2]) * 0x7fffffff / 0xff);
         pixel.a = (int32_t)(((uint64_t)src[3]) * 0x7fffffff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_snorm pixel;
         pixel.r = (int32_t)(((uint64_t)src[0]) * 0x7fffffff / 0xff);
         pixel.g = (int32_t)(((uint64_t)src[1]) * 0x7fffffff / 0xff);
         pixel.b = (int32_t)(((uint64_t)src[2]) * 0x7fffffff / 0xff);
         pixel.a = (int32_t)(((uint64_t)src[3]) * 0x7fffffff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of R32_SSCALED texels to RGBA floats.
 *
 * Each 4-byte source texel is read as a host-endian signed 32-bit integer and
 * converted to float without normalization.  It becomes the red channel;
 * green and blue are written as 0 and alpha as 1.
 *
 * \param dst_row     first destination row (4 floats per texel)
 * \param dst_stride  distance between destination rows, in bytes
 * \param src_row     first source row (4 bytes per texel)
 * \param src_stride  distance between source rows, in bytes
 * \param width       number of texels per row
 * \param height      number of rows
 */
static inline void
util_format_r32_sscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         int32_t r;
         /* src is a byte pointer with no alignment guarantee, so copy the
          * texel out instead of dereferencing it through a wider pointer.
          */
         memcpy(&r, src, sizeof r);
         dst[0] = (float)r; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of RGBA floats into R32_SSCALED texels.
 *
 * Only the red channel of each source texel is used.  It is clamped to
 * [-2147483648.0f, 2147483520.0f] and converted to a signed 32-bit integer,
 * which is stored in host byte order.
 *
 * \param dst_row     first destination row (4 bytes per texel)
 * \param dst_stride  distance between destination rows, in bytes
 * \param src_row     first source row (4 floats per texel)
 * \param src_stride  distance between source rows, in bytes
 * \param width       number of texels per row
 * \param height      number of rows
 */
static inline void
util_format_r32_sscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const int32_t r = (int32_t)CLAMP(src[0], -2147483648.0f, 2147483520.0f);
         /* dst is a byte pointer with no alignment guarantee, so store the
          * texel with memcpy instead of writing through a wider pointer.
          */
         memcpy(dst, &r, sizeof r);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R32_SSCALED texel from src and write it to dst as four floats.
 *
 * The 4 source bytes are read as a host-endian signed 32-bit integer and
 * converted to float for the red channel; green and blue are 0, alpha is 1.
 * The i and j arguments are not used.
 */
static inline void
util_format_r32_sscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   int32_t r;
   /* src is a byte pointer with no alignment guarantee, so copy the texel
    * out instead of dereferencing it through a wider pointer.
    */
   memcpy(&r, src, sizeof r);
   dst[0] = (float)r; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a rectangle of R32_SSCALED texels to RGBA bytes.
 *
 * Each 4-byte source texel is read as a host-endian signed 32-bit integer,
 * clamped to [0, 1] and multiplied by 0xff, so the red byte is either 0 or
 * 255.  Green and blue are written as 0 and alpha as 255.
 *
 * \param dst_row     first destination row (4 bytes per texel)
 * \param dst_stride  distance between destination rows, in bytes
 * \param src_row     first source row (4 bytes per texel)
 * \param src_stride  distance between source rows, in bytes
 * \param width       number of texels per row
 * \param height      number of rows
 */
static inline void
util_format_r32_sscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         int32_t r;
         /* src is a byte pointer with no alignment guarantee, so copy the
          * texel out instead of dereferencing it through a wider pointer.
          */
         memcpy(&r, src, sizeof r);
         dst[0] = (uint8_t)(((uint64_t)CLAMP(r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of RGBA bytes into R32_SSCALED texels.
 *
 * Only the red byte of each source texel is used.  It is divided by 0xff with
 * integer division, so the stored signed 32-bit value is 1 for 0xff and 0 for
 * every other input.  The value is stored in host byte order.
 *
 * \param dst_row     first destination row (4 bytes per texel)
 * \param dst_stride  distance between destination rows, in bytes
 * \param src_row     first source row (4 bytes per texel)
 * \param src_stride  distance between source rows, in bytes
 * \param width       number of texels per row
 * \param height      number of rows
 */
static inline void
util_format_r32_sscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const int32_t r = (int32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         /* dst is a byte pointer with no alignment guarantee, so store the
          * texel with memcpy instead of writing through a wider pointer.
          */
         memcpy(dst, &r, sizeof r);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R32G32_SSCALED texel: two signed 32-bit channels,
 * red first.  The field order is the same on little- and big-endian hosts.
 */
struct util_format_r32g32_sscaled {
   int32_t r;
   int32_t g;
};

static inline void
util_format_r32g32_sscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32_sscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_sscaled pixel;
         pixel.r = (int32_t)CLAMP(src[0], -2147483648.0f, 2147483520.0f);
         pixel.g = (int32_t)CLAMP(src[1], -2147483648.0f, 2147483520.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_sscaled pixel;
         pixel.r = (int32_t)CLAMP(src[0], -2147483648.0f, 2147483520.0f);
         pixel.g = (int32_t)CLAMP(src[1], -2147483648.0f, 2147483520.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R32G32_SSCALED texel from src and write it to dst as four floats.
 *
 * Red and green are the stored integers converted to float; blue is 0 and
 * alpha is 1.  The i and j arguments are not used.
 */
static inline void
util_format_r32g32_sscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r32g32_sscaled texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)texel.r;
   dst[1] = (float)texel.g;
   dst[2] = 0.0f;
   dst[3] = 1.0f;
}

static inline void
util_format_r32g32_sscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32_sscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_sscaled pixel;
         pixel.r = (int32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_sscaled pixel;
         pixel.r = (int32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R32G32B32_SSCALED texel: three signed 32-bit
 * channels in r, g, b order.  The field order is the same on little- and
 * big-endian hosts.
 */
struct util_format_r32g32b32_sscaled {
   int32_t r;
   int32_t g;
   int32_t b;
};

static inline void
util_format_r32g32b32_sscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32b32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 12;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32_sscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_sscaled pixel;
         pixel.r = (int32_t)CLAMP(src[0], -2147483648.0f, 2147483520.0f);
         pixel.g = (int32_t)CLAMP(src[1], -2147483648.0f, 2147483520.0f);
         pixel.b = (int32_t)CLAMP(src[2], -2147483648.0f, 2147483520.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_sscaled pixel;
         pixel.r = (int32_t)CLAMP(src[0], -2147483648.0f, 2147483520.0f);
         pixel.g = (int32_t)CLAMP(src[1], -2147483648.0f, 2147483520.0f);
         pixel.b = (int32_t)CLAMP(src[2], -2147483648.0f, 2147483520.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R32G32B32_SSCALED texel from src and write it to dst as four
 * floats.
 *
 * Red, green and blue are the stored integers converted to float; alpha is 1.
 * The i and j arguments are not used.
 */
static inline void
util_format_r32g32b32_sscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r32g32b32_sscaled texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)texel.r;
   dst[1] = (float)texel.g;
   dst[2] = (float)texel.b;
   dst[3] = 1.0f;
}

static inline void
util_format_r32g32b32_sscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint64_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32b32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint64_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#endif
         src += 12;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32_sscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_sscaled pixel;
         pixel.r = (int32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int32_t)(((uint64_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_sscaled pixel;
         pixel.r = (int32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int32_t)(((uint64_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R32G32B32A32_SSCALED texel: four signed 32-bit
 * channels in r, g, b, a order.  The field order is the same on little- and
 * big-endian hosts.
 */
struct util_format_r32g32b32a32_sscaled {
   int32_t r;
   int32_t g;
   int32_t b;
   int32_t a;
};

static inline void
util_format_r32g32b32a32_sscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#else
         struct util_format_r32g32b32a32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#endif
         src += 16;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32a32_sscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_sscaled pixel;
         pixel.r = (int32_t)CLAMP(src[0], -2147483648.0f, 2147483520.0f);
         pixel.g = (int32_t)CLAMP(src[1], -2147483648.0f, 2147483520.0f);
         pixel.b = (int32_t)CLAMP(src[2], -2147483648.0f, 2147483520.0f);
         pixel.a = (int32_t)CLAMP(src[3], -2147483648.0f, 2147483520.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_sscaled pixel;
         pixel.r = (int32_t)CLAMP(src[0], -2147483648.0f, 2147483520.0f);
         pixel.g = (int32_t)CLAMP(src[1], -2147483648.0f, 2147483520.0f);
         pixel.b = (int32_t)CLAMP(src[2], -2147483648.0f, 2147483520.0f);
         pixel.a = (int32_t)CLAMP(src[3], -2147483648.0f, 2147483520.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R32G32B32A32_SSCALED texel from src and write it to dst as four
 * floats.
 *
 * Each channel is the stored integer converted to float.  The i and j
 * arguments are not used.
 */
static inline void
util_format_r32g32b32a32_sscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r32g32b32a32_sscaled texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)texel.r;
   dst[1] = (float)texel.g;
   dst[2] = (float)texel.b;
   dst[3] = (float)texel.a;
}

static inline void
util_format_r32g32b32a32_sscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint64_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = (uint8_t)(((uint64_t)CLAMP(pixel.a, 0, 1)) * 0xff / 0x1); /* a */
#else
         struct util_format_r32g32b32a32_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint64_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint64_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint64_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = (uint8_t)(((uint64_t)CLAMP(pixel.a, 0, 1)) * 0xff / 0x1); /* a */
#endif
         src += 16;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32a32_sscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_sscaled pixel;
         pixel.r = (int32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int32_t)(((uint64_t)src[2]) * 0x1 / 0xff);
         pixel.a = (int32_t)(((uint64_t)src[3]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_sscaled pixel;
         pixel.r = (int32_t)(((uint64_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int32_t)(((uint64_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int32_t)(((uint64_t)src[2]) * 0x1 / 0xff);
         pixel.a = (int32_t)(((uint64_t)src[3]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R16_FLOAT texel: a single 16-bit channel.  The
 * functions in this file convert it with util_half_to_float() and
 * util_float_to_half().
 */
struct util_format_r16_float {
   uint16_t r;
};

static inline void
util_format_r16_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_half_to_float(pixel.r); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r16_float pixel;
         pixel.r = util_float_to_half(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R16_FLOAT texel from src and write it to dst as four floats.
 *
 * Red is the stored value passed through util_half_to_float(); green and blue
 * are 0 and alpha is 1.  The i and j arguments are not used.
 */
static inline void
util_format_r16_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r16_float texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = util_half_to_float(texel.r);
   dst[1] = 0.0f;
   dst[2] = 0.0f;
   dst[3] = 1.0f;
}

static inline void
util_format_r16_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(util_half_to_float(pixel.r)); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r16_float pixel;
         pixel.r = util_float_to_half((float)(src[0] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R16G16_FLOAT texel: two 16-bit channels, red first.
 * The field order is the same on little- and big-endian hosts.
 */
struct util_format_r16g16_float {
   uint16_t r;
   uint16_t g;
};

static inline void
util_format_r16g16_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_half_to_float(pixel.r); /* r */
         dst[1] = util_half_to_float(pixel.g); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r16g16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_half_to_float(pixel.r); /* r */
         dst[1] = util_half_to_float(pixel.g); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16_float pixel;
         pixel.r = util_float_to_half(src[0]);
         pixel.g = util_float_to_half(src[1]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16_float pixel;
         pixel.r = util_float_to_half(src[0]);
         pixel.g = util_float_to_half(src[1]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R16G16_FLOAT texel from src and write it to dst as four floats.
 *
 * Red and green are the stored values passed through util_half_to_float();
 * blue is 0 and alpha is 1.  The i and j arguments are not used.
 */
static inline void
util_format_r16g16_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r16g16_float texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = util_half_to_float(texel.r);
   dst[1] = util_half_to_float(texel.g);
   dst[2] = 0.0f;
   dst[3] = 1.0f;
}

static inline void
util_format_r16g16_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(util_half_to_float(pixel.r)); /* r */
         dst[1] = float_to_ubyte(util_half_to_float(pixel.g)); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r16g16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(util_half_to_float(pixel.r)); /* r */
         dst[1] = float_to_ubyte(util_half_to_float(pixel.g)); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#endif
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16_float pixel;
         pixel.r = util_float_to_half((float)(src[0] * (1.0f/0xff)));
         pixel.g = util_float_to_half((float)(src[1] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16_float pixel;
         pixel.r = util_float_to_half((float)(src[0] * (1.0f/0xff)));
         pixel.g = util_float_to_half((float)(src[1] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R16G16B16_FLOAT texel: three 16-bit channels in
 * r, g, b order.  The field order is the same on little- and big-endian
 * hosts.
 */
struct util_format_r16g16b16_float {
   uint16_t r;
   uint16_t g;
   uint16_t b;
};

static inline void
util_format_r16g16b16_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_half_to_float(pixel.r); /* r */
         dst[1] = util_half_to_float(pixel.g); /* g */
         dst[2] = util_half_to_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r16g16b16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_half_to_float(pixel.r); /* r */
         dst[1] = util_half_to_float(pixel.g); /* g */
         dst[2] = util_half_to_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#endif
         src += 6;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_float pixel;
         pixel.r = util_float_to_half(src[0]);
         pixel.g = util_float_to_half(src[1]);
         pixel.b = util_float_to_half(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_float pixel;
         pixel.r = util_float_to_half(src[0]);
         pixel.g = util_float_to_half(src[1]);
         pixel.b = util_float_to_half(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R16G16B16_FLOAT texel from src and write it to dst as four
 * floats.
 *
 * Red, green and blue are the stored values passed through
 * util_half_to_float(); alpha is 1.  The i and j arguments are not used.
 */
static inline void
util_format_r16g16b16_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r16g16b16_float texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = util_half_to_float(texel.r);
   dst[1] = util_half_to_float(texel.g);
   dst[2] = util_half_to_float(texel.b);
   dst[3] = 1.0f;
}

static inline void
util_format_r16g16b16_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(util_half_to_float(pixel.r)); /* r */
         dst[1] = float_to_ubyte(util_half_to_float(pixel.g)); /* g */
         dst[2] = float_to_ubyte(util_half_to_float(pixel.b)); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r16g16b16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(util_half_to_float(pixel.r)); /* r */
         dst[1] = float_to_ubyte(util_half_to_float(pixel.g)); /* g */
         dst[2] = float_to_ubyte(util_half_to_float(pixel.b)); /* b */
         dst[3] = 255; /* a */
#endif
         src += 6;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_float pixel;
         pixel.r = util_float_to_half((float)(src[0] * (1.0f/0xff)));
         pixel.g = util_float_to_half((float)(src[1] * (1.0f/0xff)));
         pixel.b = util_float_to_half((float)(src[2] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_float pixel;
         pixel.r = util_float_to_half((float)(src[0] * (1.0f/0xff)));
         pixel.g = util_float_to_half((float)(src[1] * (1.0f/0xff)));
         pixel.b = util_float_to_half((float)(src[2] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one r16g16b16a16_float pixel: four raw 16-bit
 * channels in r, g, b, a order.  The order is the same for either host
 * endianness, so no UTIL_ARCH_BIG_ENDIAN variant is needed.
 */
struct util_format_r16g16b16a16_float {
   uint16_t r; /* red */
   uint16_t g; /* green */
   uint16_t b; /* blue */
   uint16_t a; /* alpha */
};

static inline void
util_format_r16g16b16a16_float_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_half_to_float(pixel.r); /* r */
         dst[1] = util_half_to_float(pixel.g); /* g */
         dst[2] = util_half_to_float(pixel.b); /* b */
         dst[3] = util_half_to_float(pixel.a); /* a */
#else
         struct util_format_r16g16b16a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = util_half_to_float(pixel.r); /* r */
         dst[1] = util_half_to_float(pixel.g); /* g */
         dst[2] = util_half_to_float(pixel.b); /* b */
         dst[3] = util_half_to_float(pixel.a); /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16a16_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_float pixel;
         pixel.r = util_float_to_half(src[0]);
         pixel.g = util_float_to_half(src[1]);
         pixel.b = util_float_to_half(src[2]);
         pixel.a = util_float_to_half(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_float pixel;
         pixel.r = util_float_to_half(src[0]);
         pixel.g = util_float_to_half(src[1]);
         pixel.b = util_float_to_half(src[2]);
         pixel.a = util_float_to_half(src[3]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one r16g16b16a16_float pixel from src as float RGBA.
 *
 * All four stored channels are converted with util_half_to_float().
 * The i and j arguments are not used.
 */
static inline void
util_format_r16g16b16a16_float_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The channel layout is the same for either host endianness. */
   struct util_format_r16g16b16a16_float texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = util_half_to_float(texel.r); /* red */
   dst[1] = util_half_to_float(texel.g); /* green */
   dst[2] = util_half_to_float(texel.b); /* blue */
   dst[3] = util_half_to_float(texel.a); /* alpha */
}

static inline void
util_format_r16g16b16a16_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(util_half_to_float(pixel.r)); /* r */
         dst[1] = float_to_ubyte(util_half_to_float(pixel.g)); /* g */
         dst[2] = float_to_ubyte(util_half_to_float(pixel.b)); /* b */
         dst[3] = float_to_ubyte(util_half_to_float(pixel.a)); /* a */
#else
         struct util_format_r16g16b16a16_float pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = float_to_ubyte(util_half_to_float(pixel.r)); /* r */
         dst[1] = float_to_ubyte(util_half_to_float(pixel.g)); /* g */
         dst[2] = float_to_ubyte(util_half_to_float(pixel.b)); /* b */
         dst[3] = float_to_ubyte(util_half_to_float(pixel.a)); /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16a16_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_float pixel;
         pixel.r = util_float_to_half((float)(src[0] * (1.0f/0xff)));
         pixel.g = util_float_to_half((float)(src[1] * (1.0f/0xff)));
         pixel.b = util_float_to_half((float)(src[2] * (1.0f/0xff)));
         pixel.a = util_float_to_half((float)(src[3] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_float pixel;
         pixel.r = util_float_to_half((float)(src[0] * (1.0f/0xff)));
         pixel.g = util_float_to_half((float)(src[1] * (1.0f/0xff)));
         pixel.b = util_float_to_half((float)(src[2] * (1.0f/0xff)));
         pixel.a = util_float_to_half((float)(src[3] * (1.0f/0xff)));
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of r16_unorm pixels (one 16-bit channel, 2 bytes each)
 * to float RGBA.
 *
 * Red is the stored value scaled by 1/0xffff; green and blue are written
 * as 0 and alpha as 1.
 * src_stride and dst_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r16_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t r;
         /* src is a byte pointer with no 2-byte alignment guarantee, so
          * load the texel with memcpy instead of through a cast pointer. */
         memcpy(&r, src, sizeof r);
         dst[0] = (float)(r * (1.0f/0xffff)); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of float RGBA pixels into r16_unorm (2 bytes/pixel).
 *
 * Only the first float of each pixel is used: it is clamped to [0, 1],
 * multiplied by 0xffff and passed through util_iround().
 * dst_stride and src_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r16_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)util_iround(CLAMP(src[0], 0.0f, 1.0f) * 0xffff);
         /* dst is a byte pointer with no 2-byte alignment guarantee, so
          * store the texel with memcpy instead of through a cast pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one r16_unorm pixel from src as float RGBA.
 *
 * Red is the stored 16-bit value scaled by 1/0xffff; green and blue are
 * written as 0 and alpha as 1.  The i and j arguments are not used.
 */
static inline void
util_format_r16_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint16_t r;
   /* src is a byte pointer with no 2-byte alignment guarantee, so load
    * the texel with memcpy instead of through a cast pointer. */
   memcpy(&r, src, sizeof r);
   dst[0] = (float)(r * (1.0f/0xffff)); /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a rectangle of r16_unorm pixels (2 bytes each) to 8-bit RGBA.
 *
 * Red is the high byte of the stored 16-bit value (value >> 8); green and
 * blue are written as 0 and alpha as 255.
 * src_stride and dst_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r16_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t r;
         /* src is a byte pointer with no 2-byte alignment guarantee, so
          * load the texel with memcpy instead of through a cast pointer. */
         memcpy(&r, src, sizeof r);
         dst[0] = (uint8_t)(r >> 8); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into r16_unorm (2 bytes/pixel).
 *
 * Only the red byte is used; it is widened with the integer expression
 * red * 0xffff / 0xff.
 * dst_stride and src_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r16_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)(((uint32_t)src[0]) * 0xffff / 0xff);
         /* dst is a byte pointer with no 2-byte alignment guarantee, so
          * store the texel with memcpy instead of through a cast pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of r16g16_unorm pixels (4 bytes each) to float RGBA.
 *
 * Each pixel is read as one native 32-bit word.  On little-endian hosts red
 * is the low 16 bits and green the high 16 bits; on big-endian hosts the
 * halves are swapped.  Both channels are scaled by 1/0xffff; blue is
 * written as 0 and alpha as 1.
 * src_stride and dst_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r16g16_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t r;
         uint32_t g;
         /* src is a byte pointer with no 4-byte alignment guarantee, so
          * load the texel with memcpy instead of through a cast pointer. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = value >> 16;
         g = (value) & 0xffff;
#else
         r = (value) & 0xffff;
         g = value >> 16;
#endif
         dst[0] = (float)(r * (1.0f/0xffff)); /* r */
         dst[1] = (float)(g * (1.0f/0xffff)); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of float RGBA pixels into r16g16_unorm (4 bytes/pixel).
 *
 * The first two floats of each pixel are clamped to [0, 1], multiplied by
 * 0xffff and passed through util_iround().  On little-endian hosts red
 * occupies the low 16 bits of the stored 32-bit word and green the high 16
 * bits; on big-endian hosts the halves are swapped.
 * dst_stride and src_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r16g16_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t r = (uint16_t)util_iround(CLAMP(src[0], 0.0f, 1.0f) * 0xffff);
         const uint32_t g = (uint16_t)util_iround(CLAMP(src[1], 0.0f, 1.0f) * 0xffff);
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (r << 16) | g;
#else
         value = r | (g << 16);
#endif
         /* dst is a byte pointer with no 4-byte alignment guarantee, so
          * store the texel with memcpy instead of through a cast pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one r16g16_unorm pixel from src as float RGBA.
 *
 * The pixel is read as one native 32-bit word.  On little-endian hosts red
 * is the low 16 bits and green the high 16 bits; on big-endian hosts the
 * halves are swapped.  Both channels are scaled by 1/0xffff; blue is
 * written as 0 and alpha as 1.  The i and j arguments are not used.
 */
static inline void
util_format_r16g16_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   uint32_t r;
   uint32_t g;
   /* src is a byte pointer with no 4-byte alignment guarantee, so load
    * the texel with memcpy instead of through a cast pointer. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   r = value >> 16;
   g = (value) & 0xffff;
#else
   r = (value) & 0xffff;
   g = value >> 16;
#endif
   dst[0] = (float)(r * (1.0f/0xffff)); /* r */
   dst[1] = (float)(g * (1.0f/0xffff)); /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a rectangle of r16g16_unorm pixels (4 bytes each) to 8-bit RGBA.
 *
 * Each pixel is read as one native 32-bit word.  On little-endian hosts red
 * is the low 16 bits and green the high 16 bits; on big-endian hosts the
 * halves are swapped.  Each channel is reduced to its high byte (>> 8);
 * blue is written as 0 and alpha as 255.
 * src_stride and dst_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r16g16_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t r;
         uint32_t g;
         /* src is a byte pointer with no 4-byte alignment guarantee, so
          * load the texel with memcpy instead of through a cast pointer. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = value >> 16;
         g = (value) & 0xffff;
#else
         r = (value) & 0xffff;
         g = value >> 16;
#endif
         dst[0] = (uint8_t)(r >> 8); /* r */
         dst[1] = (uint8_t)(g >> 8); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into r16g16_unorm (4 bytes/pixel).
 *
 * The red and green bytes are each widened with the integer expression
 * byte * 0xffff / 0xff.  On little-endian hosts red occupies the low 16
 * bits of the stored 32-bit word and green the high 16 bits; on big-endian
 * hosts the halves are swapped.
 * dst_stride and src_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r16g16_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t r = (uint16_t)(((uint32_t)src[0]) * 0xffff / 0xff);
         const uint32_t g = (uint16_t)(((uint32_t)src[1]) * 0xffff / 0xff);
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (r << 16) | g;
#else
         value = r | (g << 16);
#endif
         /* dst is a byte pointer with no 4-byte alignment guarantee, so
          * store the texel with memcpy instead of through a cast pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one r16g16b16_unorm pixel: three 16-bit channels in
 * r, g, b order.  The order is the same for either host endianness, so no
 * UTIL_ARCH_BIG_ENDIAN variant is needed.
 */
struct util_format_r16g16b16_unorm {
   uint16_t r; /* red */
   uint16_t g; /* green */
   uint16_t b; /* blue */
};

static inline void
util_format_r16g16b16_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0f/0xffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0f/0xffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0f/0xffff)); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r16g16b16_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0f/0xffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0f/0xffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0f/0xffff)); /* b */
         dst[3] = 1; /* a */
#endif
         src += 6;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_unorm pixel;
         pixel.r = (uint16_t)util_iround(CLAMP(src[0], 0.0f, 1.0f) * 0xffff);
         pixel.g = (uint16_t)util_iround(CLAMP(src[1], 0.0f, 1.0f) * 0xffff);
         pixel.b = (uint16_t)util_iround(CLAMP(src[2], 0.0f, 1.0f) * 0xffff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_unorm pixel;
         pixel.r = (uint16_t)util_iround(CLAMP(src[0], 0.0f, 1.0f) * 0xffff);
         pixel.g = (uint16_t)util_iround(CLAMP(src[1], 0.0f, 1.0f) * 0xffff);
         pixel.b = (uint16_t)util_iround(CLAMP(src[2], 0.0f, 1.0f) * 0xffff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one r16g16b16_unorm pixel from src as float RGBA.
 *
 * Each stored channel is scaled by 1/0xffff; dst[3] is set to the
 * constant 1.  The i and j arguments are not used.
 */
static inline void
util_format_r16g16b16_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The channel layout is the same for either host endianness. */
   struct util_format_r16g16b16_unorm texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)(texel.r * (1.0f/0xffff)); /* red */
   dst[1] = (float)(texel.g * (1.0f/0xffff)); /* green */
   dst[2] = (float)(texel.b * (1.0f/0xffff)); /* blue */
   dst[3] = 1;                                /* alpha is not stored */
}

static inline void
util_format_r16g16b16_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(pixel.r >> 8); /* r */
         dst[1] = (uint8_t)(pixel.g >> 8); /* g */
         dst[2] = (uint8_t)(pixel.b >> 8); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r16g16b16_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(pixel.r >> 8); /* r */
         dst[1] = (uint8_t)(pixel.g >> 8); /* g */
         dst[2] = (uint8_t)(pixel.b >> 8); /* b */
         dst[3] = 255; /* a */
#endif
         src += 6;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_unorm pixel;
         pixel.r = (uint16_t)(((uint32_t)src[0]) * 0xffff / 0xff);
         pixel.g = (uint16_t)(((uint32_t)src[1]) * 0xffff / 0xff);
         pixel.b = (uint16_t)(((uint32_t)src[2]) * 0xffff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_unorm pixel;
         pixel.r = (uint16_t)(((uint32_t)src[0]) * 0xffff / 0xff);
         pixel.g = (uint16_t)(((uint32_t)src[1]) * 0xffff / 0xff);
         pixel.b = (uint16_t)(((uint32_t)src[2]) * 0xffff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one r16g16b16a16_unorm pixel: four 16-bit channels
 * in r, g, b, a order.  The order is the same for either host endianness,
 * so no UTIL_ARCH_BIG_ENDIAN variant is needed.
 */
struct util_format_r16g16b16a16_unorm {
   uint16_t r; /* red */
   uint16_t g; /* green */
   uint16_t b; /* blue */
   uint16_t a; /* alpha */
};

static inline void
util_format_r16g16b16a16_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0f/0xffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0f/0xffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0f/0xffff)); /* b */
         dst[3] = (float)(pixel.a * (1.0f/0xffff)); /* a */
#else
         struct util_format_r16g16b16a16_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0f/0xffff)); /* r */
         dst[1] = (float)(pixel.g * (1.0f/0xffff)); /* g */
         dst[2] = (float)(pixel.b * (1.0f/0xffff)); /* b */
         dst[3] = (float)(pixel.a * (1.0f/0xffff)); /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16a16_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_unorm pixel;
         pixel.r = (uint16_t)util_iround(CLAMP(src[0], 0.0f, 1.0f) * 0xffff);
         pixel.g = (uint16_t)util_iround(CLAMP(src[1], 0.0f, 1.0f) * 0xffff);
         pixel.b = (uint16_t)util_iround(CLAMP(src[2], 0.0f, 1.0f) * 0xffff);
         pixel.a = (uint16_t)util_iround(CLAMP(src[3], 0.0f, 1.0f) * 0xffff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_unorm pixel;
         pixel.r = (uint16_t)util_iround(CLAMP(src[0], 0.0f, 1.0f) * 0xffff);
         pixel.g = (uint16_t)util_iround(CLAMP(src[1], 0.0f, 1.0f) * 0xffff);
         pixel.b = (uint16_t)util_iround(CLAMP(src[2], 0.0f, 1.0f) * 0xffff);
         pixel.a = (uint16_t)util_iround(CLAMP(src[3], 0.0f, 1.0f) * 0xffff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one r16g16b16a16_unorm pixel from src as float RGBA.
 *
 * Each of the four stored channels is scaled by 1/0xffff.
 * The i and j arguments are not used.
 */
static inline void
util_format_r16g16b16a16_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The channel layout is the same for either host endianness. */
   struct util_format_r16g16b16a16_unorm texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)(texel.r * (1.0f/0xffff)); /* red */
   dst[1] = (float)(texel.g * (1.0f/0xffff)); /* green */
   dst[2] = (float)(texel.b * (1.0f/0xffff)); /* blue */
   dst[3] = (float)(texel.a * (1.0f/0xffff)); /* alpha */
}

static inline void
util_format_r16g16b16a16_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(pixel.r >> 8); /* r */
         dst[1] = (uint8_t)(pixel.g >> 8); /* g */
         dst[2] = (uint8_t)(pixel.b >> 8); /* b */
         dst[3] = (uint8_t)(pixel.a >> 8); /* a */
#else
         struct util_format_r16g16b16a16_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(pixel.r >> 8); /* r */
         dst[1] = (uint8_t)(pixel.g >> 8); /* g */
         dst[2] = (uint8_t)(pixel.b >> 8); /* b */
         dst[3] = (uint8_t)(pixel.a >> 8); /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16a16_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_unorm pixel;
         pixel.r = (uint16_t)(((uint32_t)src[0]) * 0xffff / 0xff);
         pixel.g = (uint16_t)(((uint32_t)src[1]) * 0xffff / 0xff);
         pixel.b = (uint16_t)(((uint32_t)src[2]) * 0xffff / 0xff);
         pixel.a = (uint16_t)(((uint32_t)src[3]) * 0xffff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_unorm pixel;
         pixel.r = (uint16_t)(((uint32_t)src[0]) * 0xffff / 0xff);
         pixel.g = (uint16_t)(((uint32_t)src[1]) * 0xffff / 0xff);
         pixel.b = (uint16_t)(((uint32_t)src[2]) * 0xffff / 0xff);
         pixel.a = (uint16_t)(((uint32_t)src[3]) * 0xffff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of r16_uscaled pixels (2 bytes each) to float RGBA.
 *
 * Red is the stored 16-bit unsigned value converted to float without
 * scaling; green and blue are written as 0 and alpha as 1.
 * src_stride and dst_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r16_uscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t r;
         /* src is a byte pointer with no 2-byte alignment guarantee, so
          * load the texel with memcpy instead of through a cast pointer. */
         memcpy(&r, src, sizeof r);
         dst[0] = (float)r; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of float RGBA pixels into r16_uscaled (2 bytes/pixel).
 *
 * Only the first float of each pixel is used: it is clamped to
 * [0, 65535] and converted to a 16-bit unsigned value by a plain cast.
 * dst_stride and src_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r16_uscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)CLAMP(src[0], 0.0f, 65535.0f);
         /* dst is a byte pointer with no 2-byte alignment guarantee, so
          * store the texel with memcpy instead of through a cast pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one r16_uscaled pixel from src as float RGBA.
 *
 * Red is the stored 16-bit unsigned value converted to float without
 * scaling; green and blue are written as 0 and alpha as 1.
 * The i and j arguments are not used.
 */
static inline void
util_format_r16_uscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint16_t r;
   /* src is a byte pointer with no 2-byte alignment guarantee, so load
    * the texel with memcpy instead of through a cast pointer. */
   memcpy(&r, src, sizeof r);
   dst[0] = (float)r; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a rectangle of r16_uscaled pixels (2 bytes each) to 8-bit RGBA.
 *
 * Red is MIN2(value, 1) multiplied by 0xff, so (assuming MIN2 is the usual
 * minimum macro) a stored 0 gives 0 and any non-zero value gives 255.
 * Green and blue are written as 0 and alpha as 255.
 * src_stride and dst_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r16_uscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t r;
         /* src is a byte pointer with no 2-byte alignment guarantee, so
          * load the texel with memcpy instead of through a cast pointer. */
         memcpy(&r, src, sizeof r);
         dst[0] = (uint8_t)(((uint32_t)MIN2(r, 1)) * 0xff / 0x1); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of 8-bit RGBA pixels into r16_uscaled (2 bytes/pixel).
 *
 * Only the red byte is used; the stored value is the integer quotient
 * red * 1 / 0xff, i.e. 1 when red is 255 and 0 otherwise.
 * dst_stride and src_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r16_uscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         /* dst is a byte pointer with no 2-byte alignment guarantee, so
          * store the texel with memcpy instead of through a cast pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of r16g16_uscaled pixels (4 bytes each) to float RGBA.
 *
 * Each pixel is read as one native 32-bit word.  On little-endian hosts red
 * is the low 16 bits and green the high 16 bits; on big-endian hosts the
 * halves are swapped.  Both channels are converted to float without
 * scaling; blue is written as 0 and alpha as 1.
 * src_stride and dst_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r16g16_uscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t r;
         uint32_t g;
         /* src is a byte pointer with no 4-byte alignment guarantee, so
          * load the texel with memcpy instead of through a cast pointer. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = value >> 16;
         g = (value) & 0xffff;
#else
         r = (value) & 0xffff;
         g = value >> 16;
#endif
         dst[0] = (float)r; /* r */
         dst[1] = (float)g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of float RGBA pixels into r16g16_uscaled (4 bytes/pixel).
 *
 * The first two floats of each pixel are clamped to [0, 65535] and
 * converted to 16-bit unsigned values by a plain cast.  On little-endian
 * hosts red occupies the low 16 bits of the stored 32-bit word and green
 * the high 16 bits; on big-endian hosts the halves are swapped.
 * dst_stride and src_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r16g16_uscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t r = (uint16_t)CLAMP(src[0], 0.0f, 65535.0f);
         const uint32_t g = (uint16_t)CLAMP(src[1], 0.0f, 65535.0f);
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (r << 16) | g;
#else
         value = r | (g << 16);
#endif
         /* dst is a byte pointer with no 4-byte alignment guarantee, so
          * store the texel with memcpy instead of through a cast pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16_USCALED pixel as RGBA floats.
 *
 * src points at the 4-byte pixel; i and j are not used.  Red and green are
 * converted to float without scaling, blue is written as 0 and alpha as 1.
 */
static inline void
util_format_r16g16_uscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   uint32_t r;
   uint32_t g;

   /* src is a byte pointer with no alignment guarantee: copy the bytes
    * rather than loading through a cast uint32_t pointer. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   r = value >> 16;
   g = (value) & 0xffff;
#else
   r = (value) & 0xffff;
   g = value >> 16;
#endif
   dst[0] = (float)r; /* r */
   dst[1] = (float)g; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack rows of R16G16_USCALED pixels to RGBA8.
 *
 * A channel value of 0 becomes 0 and any non-zero value becomes 255 (the
 * value is limited to 1 before being scaled by 255).  Blue is written as 0
 * and alpha as 255.
 *
 * dst_row/dst_stride: first destination row and its pitch in bytes
 *                     (4 bytes are written per pixel).
 * src_row/src_stride: first source row and its pitch in bytes.
 * width/height:       pixels per row and number of rows.
 */
static inline void
util_format_r16g16_uscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t r;
         uint32_t g;
         /* src is a byte pointer with no alignment guarantee: copy the
          * bytes rather than loading through a cast uint32_t pointer. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = value >> 16;
         g = (value) & 0xffff;
#else
         r = (value) & 0xffff;
         g = value >> 16;
#endif
         /* min(channel, 1) * 255: zero stays zero, everything else saturates. */
         dst[0] = r ? 0xff : 0; /* r */
         dst[1] = g ? 0xff : 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA8 pixels into R16G16_USCALED.
 *
 * Only the red and green bytes are read.  The conversion byte * 1 / 255 is
 * an integer division, so a channel is stored as 1 when the byte is 255 and
 * as 0 otherwise.  The two channels are combined into one 32-bit word whose
 * layout depends on UTIL_ARCH_BIG_ENDIAN.
 *
 * dst_row/dst_stride: first destination row and its pitch in bytes
 *                     (4 bytes are written per pixel).
 * src_row/src_stride: first source row and its pitch in bytes.
 * width/height:       pixels per row and number of rows.
 */
static inline void
util_format_r16g16_uscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t r = (uint16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         const uint32_t g = (uint16_t)(((uint32_t)src[1]) * 0x1 / 0xff);
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t value = (r << 16) | g;
#else
         const uint32_t value = r | (g << 16);
#endif
         /* dst is a byte pointer with no alignment guarantee: copy the
          * bytes rather than storing through a cast uint32_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Layout of one R16G16B16_USCALED pixel: three unsigned 16-bit channels in
 * r, g, b order.  The member order is the same whether or not
 * UTIL_ARCH_BIG_ENDIAN is set, so one declaration serves both cases.
 */
struct util_format_r16g16b16_uscaled {
   uint16_t r, g, b;
};

static inline void
util_format_r16g16b16_uscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r16g16b16_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 6;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16_uscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_uscaled pixel;
         pixel.r = (uint16_t)CLAMP(src[0], 0.0f, 65535.0f);
         pixel.g = (uint16_t)CLAMP(src[1], 0.0f, 65535.0f);
         pixel.b = (uint16_t)CLAMP(src[2], 0.0f, 65535.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_uscaled pixel;
         pixel.r = (uint16_t)CLAMP(src[0], 0.0f, 65535.0f);
         pixel.g = (uint16_t)CLAMP(src[1], 0.0f, 65535.0f);
         pixel.b = (uint16_t)CLAMP(src[2], 0.0f, 65535.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16_USCALED pixel as RGBA floats.
 *
 * src points at the 6-byte pixel; i and j are not used.  The three channels
 * are converted to float without scaling and alpha is written as 1.
 */
static inline void
util_format_r16g16b16_uscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r16g16b16_uscaled texel;

   /* The same code serves both byte orders. */
   memcpy(&texel, src, sizeof texel);

   dst[0] = (float)texel.r;
   dst[1] = (float)texel.g;
   dst[2] = (float)texel.b;
   dst[3] = 1;
}

static inline void
util_format_r16g16b16_uscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r16g16b16_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#endif
         src += 6;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16_uscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_uscaled pixel;
         pixel.r = (uint16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint16_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint16_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_uscaled pixel;
         pixel.r = (uint16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint16_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint16_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Layout of one R16G16B16A16_USCALED pixel: four unsigned 16-bit channels
 * in r, g, b, a order.  The member order is the same whether or not
 * UTIL_ARCH_BIG_ENDIAN is set, so one declaration serves both cases.
 */
struct util_format_r16g16b16a16_uscaled {
   uint16_t r, g, b, a;
};

static inline void
util_format_r16g16b16a16_uscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#else
         struct util_format_r16g16b16a16_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16a16_uscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_uscaled pixel;
         pixel.r = (uint16_t)CLAMP(src[0], 0.0f, 65535.0f);
         pixel.g = (uint16_t)CLAMP(src[1], 0.0f, 65535.0f);
         pixel.b = (uint16_t)CLAMP(src[2], 0.0f, 65535.0f);
         pixel.a = (uint16_t)CLAMP(src[3], 0.0f, 65535.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_uscaled pixel;
         pixel.r = (uint16_t)CLAMP(src[0], 0.0f, 65535.0f);
         pixel.g = (uint16_t)CLAMP(src[1], 0.0f, 65535.0f);
         pixel.b = (uint16_t)CLAMP(src[2], 0.0f, 65535.0f);
         pixel.a = (uint16_t)CLAMP(src[3], 0.0f, 65535.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16A16_USCALED pixel as RGBA floats.
 *
 * src points at the 8-byte pixel; i and j are not used.  All four channels
 * are converted to float without scaling.
 */
static inline void
util_format_r16g16b16a16_uscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r16g16b16a16_uscaled texel;

   /* The same code serves both byte orders. */
   memcpy(&texel, src, sizeof texel);

   dst[0] = (float)texel.r;
   dst[1] = (float)texel.g;
   dst[2] = (float)texel.b;
   dst[3] = (float)texel.a;
}

static inline void
util_format_r16g16b16a16_uscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = (uint8_t)(((uint32_t)MIN2(pixel.a, 1)) * 0xff / 0x1); /* a */
#else
         struct util_format_r16g16b16a16_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = (uint8_t)(((uint32_t)MIN2(pixel.a, 1)) * 0xff / 0x1); /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16a16_uscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_uscaled pixel;
         pixel.r = (uint16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint16_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint16_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         pixel.a = (uint16_t)(((uint32_t)src[3]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_uscaled pixel;
         pixel.r = (uint16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint16_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint16_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         pixel.a = (uint16_t)(((uint32_t)src[3]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack rows of R16_SNORM pixels to RGBA floats.
 *
 * The signed 16-bit red value is scaled by 1/32767.  The most negative
 * input (-32768) would scale to slightly below -1, so the result is limited
 * to -1.0 from below.  Green and blue are written as 0 and alpha as 1.
 *
 * dst_row/dst_stride: first destination row and its pitch in bytes
 *                     (4 floats are written per pixel).
 * src_row/src_stride: first source row and its pitch in bytes
 *                     (2 bytes are consumed per pixel).
 * width/height:       pixels per row and number of rows.
 */
static inline void
util_format_r16_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         int16_t r;
         float rf;
         /* src is a byte pointer with no alignment guarantee: copy the
          * bytes rather than loading through a cast uint16_t pointer. */
         memcpy(&r, src, sizeof r);
         rf = (float)(r * (1.0f/0x7fff));
         dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA floats into R16_SNORM pixels.
 *
 * Red is passed through CLAMP with bounds [-1, 1], multiplied by 32767 and
 * converted with util_iround (presumably round-to-nearest; defined outside
 * this block).  Green, blue and alpha are ignored.
 *
 * dst_row/dst_stride: first destination row and its pitch in bytes
 *                     (2 bytes are written per pixel).
 * src_row/src_stride: first source row and its pitch in bytes
 *                     (4 floats are consumed per pixel).
 * width/height:       pixels per row and number of rows.
 */
static inline void
util_format_r16_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value = 0;
         value |= (uint16_t)((int16_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7fff)) ;
         /* dst is a byte pointer with no alignment guarantee: copy the
          * bytes rather than storing through a cast uint16_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16_SNORM pixel as RGBA floats.
 *
 * src points at the 2-byte pixel; i and j are not used.  The signed red
 * value is scaled by 1/32767 and limited to -1.0 from below (the most
 * negative input would otherwise scale to slightly below -1).  Green and
 * blue are written as 0 and alpha as 1.
 */
static inline void
util_format_r16_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   int16_t r;
   float rf;

   /* src is a byte pointer with no alignment guarantee: copy the bytes
    * rather than loading through a cast uint16_t pointer. */
   memcpy(&r, src, sizeof r);
   rf = (float)(r * (1.0f/0x7fff));
   dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack rows of R16_SNORM pixels to RGBA8.
 *
 * Negative red values become 0; non-negative values are shifted right by 7
 * so that 0..32767 maps to 0..255.  Green and blue are written as 0 and
 * alpha as 255.
 *
 * dst_row/dst_stride: first destination row and its pitch in bytes
 *                     (4 bytes are written per pixel).
 * src_row/src_stride: first source row and its pitch in bytes
 *                     (2 bytes are consumed per pixel).
 * width/height:       pixels per row and number of rows.
 */
static inline void
util_format_r16_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         int16_t r;
         /* src is a byte pointer with no alignment guarantee: copy the
          * bytes rather than loading through a cast uint16_t pointer. */
         memcpy(&r, src, sizeof r);
         dst[0] = (uint8_t)((r > 0 ? r : 0) >> 7); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA8 pixels into R16_SNORM.
 *
 * Only the red byte is read; it is rescaled with the integer expression
 * byte * 32767 / 255, so 0..255 maps to 0..32767.
 *
 * dst_row/dst_stride: first destination row and its pitch in bytes
 *                     (2 bytes are written per pixel).
 * src_row/src_stride: first source row and its pitch in bytes.
 * width/height:       pixels per row and number of rows.
 */
static inline void
util_format_r16_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value = 0;
         value |= (uint16_t)((int16_t)(((uint32_t)src[0]) * 0x7fff / 0xff)) ;
         /* dst is a byte pointer with no alignment guarantee: copy the
          * bytes rather than storing through a cast uint16_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack rows of R16G16_SNORM pixels to RGBA floats.
 *
 * Each 4-byte pixel is loaded as one 32-bit word; the two signed 16-bit
 * channels are sign-extended out of it (which half is red depends on
 * UTIL_ARCH_BIG_ENDIAN) and scaled by 1/32767.  The most negative input
 * (-32768) would scale to slightly below -1, so results are limited to -1.0
 * from below.  Blue is written as 0 and alpha as 1.
 *
 * dst_row/dst_stride: first destination row and its pitch in bytes
 *                     (4 floats are written per pixel).
 * src_row/src_stride: first source row and its pitch in bytes.
 * width/height:       pixels per row and number of rows.
 */
static inline void
util_format_r16g16_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         int32_t r;
         int32_t g;
         float rf;
         float gf;
         /* src is a byte pointer with no alignment guarantee: copy the
          * bytes rather than loading through a cast uint32_t pointer. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = ((int32_t)(value) ) >> 16;
         g = ((int32_t)(value << 16) ) >> 16;
#else
         r = ((int32_t)(value << 16) ) >> 16;
         g = ((int32_t)(value) ) >> 16;
#endif
         rf = (float)(r * (1.0f/0x7fff));
         gf = (float)(g * (1.0f/0x7fff));
         dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
         dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA floats into R16G16_SNORM pixels.
 *
 * Red and green are passed through CLAMP with bounds [-1, 1], multiplied by
 * 32767 and converted with util_iround (presumably round-to-nearest; defined
 * outside this block).  The two 16-bit results are combined into one 32-bit
 * word whose layout depends on UTIL_ARCH_BIG_ENDIAN.  Blue and alpha are
 * ignored.
 *
 * dst_row/dst_stride: first destination row and its pitch in bytes
 *                     (4 bytes are written per pixel).
 * src_row/src_stride: first source row and its pitch in bytes
 *                     (4 floats are consumed per pixel).
 * width/height:       pixels per row and number of rows.
 */
static inline void
util_format_r16g16_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         /* Going through uint16_t keeps only the low 16 bits of each
          * (possibly negative) channel before widening. */
         const uint32_t r = (uint16_t)((int16_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7fff));
         const uint32_t g = (uint16_t)((int16_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7fff));
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t value = (r << 16) | g;
#else
         const uint32_t value = r | (g << 16);
#endif
         /* dst is a byte pointer with no alignment guarantee: copy the
          * bytes rather than storing through a cast uint32_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16_SNORM pixel as RGBA floats.
 *
 * src points at the 4-byte pixel; i and j are not used.  The two signed
 * 16-bit channels are scaled by 1/32767 and limited to -1.0 from below (the
 * most negative input would otherwise scale to slightly below -1).  Blue is
 * written as 0 and alpha as 1.
 */
static inline void
util_format_r16g16_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   int32_t r;
   int32_t g;
   float rf;
   float gf;

   /* src is a byte pointer with no alignment guarantee: copy the bytes
    * rather than loading through a cast uint32_t pointer. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   r = ((int32_t)(value) ) >> 16;
   g = ((int32_t)(value << 16) ) >> 16;
#else
   r = ((int32_t)(value << 16) ) >> 16;
   g = ((int32_t)(value) ) >> 16;
#endif
   rf = (float)(r * (1.0f/0x7fff));
   gf = (float)(g * (1.0f/0x7fff));
   dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
   dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack rows of R16G16_SNORM pixels to RGBA8.
 *
 * Each 4-byte pixel is loaded as one 32-bit word and the two signed 16-bit
 * channels are sign-extended out of it (which half is red depends on
 * UTIL_ARCH_BIG_ENDIAN).  Negative values become 0; non-negative values are
 * shifted right by 7 so that 0..32767 maps to 0..255.  Blue is written as 0
 * and alpha as 255.
 *
 * dst_row/dst_stride: first destination row and its pitch in bytes
 *                     (4 bytes are written per pixel).
 * src_row/src_stride: first source row and its pitch in bytes.
 * width/height:       pixels per row and number of rows.
 */
static inline void
util_format_r16g16_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         int32_t r;
         int32_t g;
         /* src is a byte pointer with no alignment guarantee: copy the
          * bytes rather than loading through a cast uint32_t pointer. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = ((int32_t)(value) ) >> 16;
         g = ((int32_t)(value << 16) ) >> 16;
#else
         r = ((int32_t)(value << 16) ) >> 16;
         g = ((int32_t)(value) ) >> 16;
#endif
         dst[0] = (uint8_t)((r > 0 ? r : 0) >> 7); /* r */
         dst[1] = (uint8_t)((g > 0 ? g : 0) >> 7); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA8 pixels into R16G16_SNORM.
 *
 * Only the red and green bytes are read; each is rescaled with the integer
 * expression byte * 32767 / 255, so 0..255 maps to 0..32767.  The two
 * channels are combined into one 32-bit word whose layout depends on
 * UTIL_ARCH_BIG_ENDIAN.
 *
 * dst_row/dst_stride: first destination row and its pitch in bytes
 *                     (4 bytes are written per pixel).
 * src_row/src_stride: first source row and its pitch in bytes.
 * width/height:       pixels per row and number of rows.
 */
static inline void
util_format_r16g16_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t r = (uint16_t)((int16_t)(((uint32_t)src[0]) * 0x7fff / 0xff));
         const uint32_t g = (uint16_t)((int16_t)(((uint32_t)src[1]) * 0x7fff / 0xff));
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t value = (r << 16) | g;
#else
         const uint32_t value = r | (g << 16);
#endif
         /* dst is a byte pointer with no alignment guarantee: copy the
          * bytes rather than storing through a cast uint32_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Layout of one R16G16B16_SNORM pixel: three signed 16-bit channels in
 * r, g, b order.  The member order is the same whether or not
 * UTIL_ARCH_BIG_ENDIAN is set, so one declaration serves both cases.
 */
struct util_format_r16g16b16_snorm {
   int16_t r, g, b;
};

/**
 * Unpack rows of R16G16B16_SNORM pixels to RGBA floats.
 *
 * Each 6-byte source pixel is copied into a pixel struct and the three
 * signed channels are scaled by 1/32767.  The most negative input (-32768)
 * would scale to slightly below -1, so results are limited to -1.0 from
 * below.  Alpha is written as 1.
 *
 * dst_row/dst_stride: first destination row and its pitch in bytes
 *                     (4 floats are written per pixel).
 * src_row/src_stride: first source row and its pitch in bytes.
 * width/height:       pixels per row and number of rows.
 */
static inline void
util_format_r16g16b16_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         /* The same code serves both byte orders. */
         struct util_format_r16g16b16_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = MAX2(-1.0f, (float)(pixel.r * (1.0f/0x7fff))); /* r */
         dst[1] = MAX2(-1.0f, (float)(pixel.g * (1.0f/0x7fff))); /* g */
         dst[2] = MAX2(-1.0f, (float)(pixel.b * (1.0f/0x7fff))); /* b */
         dst[3] = 1; /* a */
         src += 6;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_snorm pixel;
         pixel.r = (int16_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7fff);
         pixel.g = (int16_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7fff);
         pixel.b = (int16_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7fff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_snorm pixel;
         pixel.r = (int16_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7fff);
         pixel.g = (int16_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7fff);
         pixel.b = (int16_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7fff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16_SNORM pixel as RGBA floats.
 *
 * src points at the 6-byte pixel; i and j are not used.  The three signed
 * channels are scaled by 1/32767 and limited to -1.0 from below (the most
 * negative input would otherwise scale to slightly below -1).  Alpha is
 * written as 1.
 */
static inline void
util_format_r16g16b16_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The same code serves both byte orders. */
   struct util_format_r16g16b16_snorm pixel;
   memcpy(&pixel, src, sizeof pixel);
   dst[0] = MAX2(-1.0f, (float)(pixel.r * (1.0f/0x7fff))); /* r */
   dst[1] = MAX2(-1.0f, (float)(pixel.g * (1.0f/0x7fff))); /* g */
   dst[2] = MAX2(-1.0f, (float)(pixel.b * (1.0f/0x7fff))); /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r16g16b16_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(MAX2(pixel.r, 0) >> 7); /* r */
         dst[1] = (uint8_t)(MAX2(pixel.g, 0) >> 7); /* g */
         dst[2] = (uint8_t)(MAX2(pixel.b, 0) >> 7); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r16g16b16_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(MAX2(pixel.r, 0) >> 7); /* r */
         dst[1] = (uint8_t)(MAX2(pixel.g, 0) >> 7); /* g */
         dst[2] = (uint8_t)(MAX2(pixel.b, 0) >> 7); /* b */
         dst[3] = 255; /* a */
#endif
         src += 6;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_snorm pixel;
         pixel.r = (int16_t)(((uint32_t)src[0]) * 0x7fff / 0xff);
         pixel.g = (int16_t)(((uint32_t)src[1]) * 0x7fff / 0xff);
         pixel.b = (int16_t)(((uint32_t)src[2]) * 0x7fff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_snorm pixel;
         pixel.r = (int16_t)(((uint32_t)src[0]) * 0x7fff / 0xff);
         pixel.g = (int16_t)(((uint32_t)src[1]) * 0x7fff / 0xff);
         pixel.b = (int16_t)(((uint32_t)src[2]) * 0x7fff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Layout of one R16G16B16A16_SNORM pixel: four signed 16-bit channels in
 * r, g, b, a order.  The member order is the same whether or not
 * UTIL_ARCH_BIG_ENDIAN is set, so one declaration serves both cases.
 */
struct util_format_r16g16b16a16_snorm {
   int16_t r, g, b, a;
};

/**
 * Unpack rows of R16G16B16A16_SNORM pixels to RGBA floats.
 *
 * Each 8-byte source pixel is copied into a pixel struct and the four
 * signed channels are scaled by 1/32767.  The most negative input (-32768)
 * would scale to slightly below -1, so results are limited to -1.0 from
 * below.
 *
 * dst_row/dst_stride: first destination row and its pitch in bytes
 *                     (4 floats are written per pixel).
 * src_row/src_stride: first source row and its pitch in bytes.
 * width/height:       pixels per row and number of rows.
 */
static inline void
util_format_r16g16b16a16_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         /* The same code serves both byte orders. */
         struct util_format_r16g16b16a16_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = MAX2(-1.0f, (float)(pixel.r * (1.0f/0x7fff))); /* r */
         dst[1] = MAX2(-1.0f, (float)(pixel.g * (1.0f/0x7fff))); /* g */
         dst[2] = MAX2(-1.0f, (float)(pixel.b * (1.0f/0x7fff))); /* b */
         dst[3] = MAX2(-1.0f, (float)(pixel.a * (1.0f/0x7fff))); /* a */
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16a16_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_snorm pixel;
         pixel.r = (int16_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7fff);
         pixel.g = (int16_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7fff);
         pixel.b = (int16_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7fff);
         pixel.a = (int16_t)util_iround(CLAMP(src[3], -1.0f, 1.0f) * 0x7fff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_snorm pixel;
         pixel.r = (int16_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7fff);
         pixel.g = (int16_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7fff);
         pixel.b = (int16_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7fff);
         pixel.a = (int16_t)util_iround(CLAMP(src[3], -1.0f, 1.0f) * 0x7fff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16A16_SNORM pixel as RGBA floats.
 *
 * src points at the 8-byte pixel; i and j are not used.  The four signed
 * channels are scaled by 1/32767 and limited to -1.0 from below (the most
 * negative input would otherwise scale to slightly below -1).
 */
static inline void
util_format_r16g16b16a16_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The same code serves both byte orders. */
   struct util_format_r16g16b16a16_snorm pixel;
   memcpy(&pixel, src, sizeof pixel);
   dst[0] = MAX2(-1.0f, (float)(pixel.r * (1.0f/0x7fff))); /* r */
   dst[1] = MAX2(-1.0f, (float)(pixel.g * (1.0f/0x7fff))); /* g */
   dst[2] = MAX2(-1.0f, (float)(pixel.b * (1.0f/0x7fff))); /* b */
   dst[3] = MAX2(-1.0f, (float)(pixel.a * (1.0f/0x7fff))); /* a */
}

static inline void
util_format_r16g16b16a16_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(MAX2(pixel.r, 0) >> 7); /* r */
         dst[1] = (uint8_t)(MAX2(pixel.g, 0) >> 7); /* g */
         dst[2] = (uint8_t)(MAX2(pixel.b, 0) >> 7); /* b */
         dst[3] = (uint8_t)(MAX2(pixel.a, 0) >> 7); /* a */
#else
         struct util_format_r16g16b16a16_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(MAX2(pixel.r, 0) >> 7); /* r */
         dst[1] = (uint8_t)(MAX2(pixel.g, 0) >> 7); /* g */
         dst[2] = (uint8_t)(MAX2(pixel.b, 0) >> 7); /* b */
         dst[3] = (uint8_t)(MAX2(pixel.a, 0) >> 7); /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16a16_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_snorm pixel;
         pixel.r = (int16_t)(((uint32_t)src[0]) * 0x7fff / 0xff);
         pixel.g = (int16_t)(((uint32_t)src[1]) * 0x7fff / 0xff);
         pixel.b = (int16_t)(((uint32_t)src[2]) * 0x7fff / 0xff);
         pixel.a = (int16_t)(((uint32_t)src[3]) * 0x7fff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_snorm pixel;
         pixel.r = (int16_t)(((uint32_t)src[0]) * 0x7fff / 0xff);
         pixel.g = (int16_t)(((uint32_t)src[1]) * 0x7fff / 0xff);
         pixel.b = (int16_t)(((uint32_t)src[2]) * 0x7fff / 0xff);
         pixel.a = (int16_t)(((uint32_t)src[3]) * 0x7fff / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack R16_SSCALED pixels to RGBA float.
 *
 * Each 2-byte source pixel is read as a host-order signed 16-bit integer and
 * converted to float, unscaled, into red.  Green and blue are written as 0
 * and alpha as 1.  Both strides are given in bytes.
 */
static inline void
util_format_r16_sscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         int16_t r;
         /* src is a plain byte pointer with no alignment guarantee, so copy
          * the texel out instead of dereferencing a casted pointer. */
         memcpy(&r, src, sizeof r);
         dst[0] = (float)r; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA float pixels into R16_SSCALED.
 *
 * Only the red channel is stored: it is passed through
 * CLAMP(r, -32768.0f, 32767.0f), converted to a signed 16-bit integer and
 * written in host byte order.  Both strides are given in bytes.
 */
static inline void
util_format_r16_sscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const int16_t r = (int16_t)CLAMP(src[0], -32768.0f, 32767.0f);
         /* dst carries no alignment guarantee; store via memcpy rather than
          * through a casted uint16_t pointer. */
         memcpy(dst, &r, sizeof r);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R16_SSCALED pixel as RGBA float.
 *
 * Red is the host-order signed 16-bit value at src converted to float; green
 * and blue are 0 and alpha is 1.  The i and j arguments are not used.
 */
static inline void
util_format_r16_sscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   int16_t r;

   /* src may be unaligned; copy instead of dereferencing a casted pointer. */
   memcpy(&r, src, sizeof r);
   dst[0] = (float)r; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack R16_SSCALED pixels to RGBA8 unorm.
 *
 * The signed 16-bit red value is passed through CLAMP(r, 0, 1) and multiplied
 * by 0xff; green and blue are written as 0 and alpha as 255.  Both strides
 * are given in bytes.
 */
static inline void
util_format_r16_sscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         int16_t r;
         /* src may be unaligned; copy instead of dereferencing a casted
          * pointer. */
         memcpy(&r, src, sizeof r);
         dst[0] = (uint8_t)(((uint32_t)CLAMP(r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA8 unorm pixels into R16_SSCALED.
 *
 * Only red is stored.  The value written is r * 0x1 / 0xff in integer
 * arithmetic, i.e. 1 for r == 255 and 0 for anything smaller, as a host-order
 * signed 16-bit integer.  Both strides are given in bytes.
 */
static inline void
util_format_r16_sscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const int16_t r = (int16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         /* dst carries no alignment guarantee; store via memcpy rather than
          * through a casted uint16_t pointer. */
         memcpy(dst, &r, sizeof r);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack R16G16_SSCALED pixels to RGBA float.
 *
 * Each 4-byte source pixel is loaded as one host-order 32-bit word.  When
 * UTIL_ARCH_BIG_ENDIAN is set red is the upper 16 bits and green the lower
 * 16 bits; otherwise the halves are swapped.  Both halves are sign-extended
 * and converted to float unscaled.  Blue is written as 0 and alpha as 1.
 * Both strides are given in bytes.
 */
static inline void
util_format_r16g16_sscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         int32_t r;
         int32_t g;
         /* src is a plain byte pointer with no alignment guarantee, so copy
          * the texel out instead of dereferencing a casted pointer. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = ((int32_t)(value) ) >> 16;
         g = ((int32_t)(value << 16) ) >> 16;
#else
         r = ((int32_t)(value << 16) ) >> 16;
         g = ((int32_t)(value) ) >> 16;
#endif
         dst[0] = (float)r; /* r */
         dst[1] = (float)g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA float pixels into R16G16_SSCALED.
 *
 * Red and green are each passed through CLAMP(c, -32768.0f, 32767.0f) and
 * converted to signed 16-bit integers; blue and alpha are ignored.  The two
 * values are combined into one 32-bit word (red in the upper half when
 * UTIL_ARCH_BIG_ENDIAN is set, in the lower half otherwise) and written in
 * host byte order.  Both strides are given in bytes.
 */
static inline void
util_format_r16g16_sscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         /* Going through uint16_t keeps only the low 16 bits of each
          * (possibly negative) channel before they are merged. */
         const uint16_t r = (uint16_t)((int16_t)CLAMP(src[0], -32768.0f, 32767.0f));
         const uint16_t g = (uint16_t)((int16_t)CLAMP(src[1], -32768.0f, 32767.0f));
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = ((uint32_t)r << 16) | g;
#else
         value = ((uint32_t)g << 16) | r;
#endif
         /* dst carries no alignment guarantee; store via memcpy rather than
          * through a casted uint32_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R16G16_SSCALED pixel as RGBA float.
 *
 * The 4-byte pixel is loaded as a host-order 32-bit word; red and green are
 * its sign-extended 16-bit halves (red is the upper half when
 * UTIL_ARCH_BIG_ENDIAN is set, the lower half otherwise).  Blue is 0 and
 * alpha is 1.  The i and j arguments are not used.
 */
static inline void
util_format_r16g16_sscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   int32_t r;
   int32_t g;

   /* src may be unaligned; copy instead of dereferencing a casted pointer. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   r = ((int32_t)(value) ) >> 16;
   g = ((int32_t)(value << 16) ) >> 16;
#else
   r = ((int32_t)(value << 16) ) >> 16;
   g = ((int32_t)(value) ) >> 16;
#endif
   dst[0] = (float)r; /* r */
   dst[1] = (float)g; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack R16G16_SSCALED pixels to RGBA8 unorm.
 *
 * Red and green are the sign-extended 16-bit halves of the host-order 32-bit
 * pixel (red is the upper half when UTIL_ARCH_BIG_ENDIAN is set, the lower
 * half otherwise).  Each is passed through CLAMP(c, 0, 1) and multiplied by
 * 0xff.  Blue is written as 0 and alpha as 255.  Both strides are in bytes.
 */
static inline void
util_format_r16g16_sscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         int32_t r;
         int32_t g;
         /* src may be unaligned; copy instead of dereferencing a casted
          * pointer. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = ((int32_t)(value) ) >> 16;
         g = ((int32_t)(value << 16) ) >> 16;
#else
         r = ((int32_t)(value << 16) ) >> 16;
         g = ((int32_t)(value) ) >> 16;
#endif
         dst[0] = (uint8_t)(((uint32_t)CLAMP(r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)CLAMP(g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA8 unorm pixels into R16G16_SSCALED.
 *
 * Red and green are each converted with c * 0x1 / 0xff in integer arithmetic
 * (1 for c == 255, otherwise 0); blue and alpha are ignored.  The two 16-bit
 * results are merged into one 32-bit word (red in the upper half when
 * UTIL_ARCH_BIG_ENDIAN is set, in the lower half otherwise) and written in
 * host byte order.  Both strides are given in bytes.
 */
static inline void
util_format_r16g16_sscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint16_t r = (uint16_t)((int16_t)(((uint32_t)src[0]) * 0x1 / 0xff));
         const uint16_t g = (uint16_t)((int16_t)(((uint32_t)src[1]) * 0x1 / 0xff));
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = ((uint32_t)r << 16) | g;
#else
         value = ((uint32_t)g << 16) | r;
#endif
         /* dst carries no alignment guarantee; store via memcpy rather than
          * through a casted uint32_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory layout of one R16G16B16_SSCALED pixel: three signed 16-bit channels
 * in r, g, b order.  The field order does not depend on
 * UTIL_ARCH_BIG_ENDIAN.
 */
struct util_format_r16g16b16_sscaled {
   int16_t r;
   int16_t g;
   int16_t b;
};

static inline void
util_format_r16g16b16_sscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r16g16b16_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 6;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16_sscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_sscaled pixel;
         pixel.r = (int16_t)CLAMP(src[0], -32768.0f, 32767.0f);
         pixel.g = (int16_t)CLAMP(src[1], -32768.0f, 32767.0f);
         pixel.b = (int16_t)CLAMP(src[2], -32768.0f, 32767.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_sscaled pixel;
         pixel.r = (int16_t)CLAMP(src[0], -32768.0f, 32767.0f);
         pixel.g = (int16_t)CLAMP(src[1], -32768.0f, 32767.0f);
         pixel.b = (int16_t)CLAMP(src[2], -32768.0f, 32767.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R16G16B16_SSCALED pixel as RGBA float.
 *
 * The three signed 16-bit channels are converted to float unscaled and alpha
 * is set to 1.  The i and j arguments are not used.
 */
static inline void
util_format_r16g16b16_sscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Same handling whether or not UTIL_ARCH_BIG_ENDIAN is set. */
   struct util_format_r16g16b16_sscaled texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)texel.r; /* r */
   dst[1] = (float)texel.g; /* g */
   dst[2] = (float)texel.b; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r16g16b16_sscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r16g16b16_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#endif
         src += 6;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16_sscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_sscaled pixel;
         pixel.r = (int16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int16_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int16_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_sscaled pixel;
         pixel.r = (int16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int16_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int16_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory layout of one R16G16B16A16_SSCALED pixel: four signed 16-bit
 * channels in r, g, b, a order.  The field order does not depend on
 * UTIL_ARCH_BIG_ENDIAN.
 */
struct util_format_r16g16b16a16_sscaled {
   int16_t r;
   int16_t g;
   int16_t b;
   int16_t a;
};

static inline void
util_format_r16g16b16a16_sscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#else
         struct util_format_r16g16b16a16_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = (float)pixel.a; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16a16_sscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_sscaled pixel;
         pixel.r = (int16_t)CLAMP(src[0], -32768.0f, 32767.0f);
         pixel.g = (int16_t)CLAMP(src[1], -32768.0f, 32767.0f);
         pixel.b = (int16_t)CLAMP(src[2], -32768.0f, 32767.0f);
         pixel.a = (int16_t)CLAMP(src[3], -32768.0f, 32767.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_sscaled pixel;
         pixel.r = (int16_t)CLAMP(src[0], -32768.0f, 32767.0f);
         pixel.g = (int16_t)CLAMP(src[1], -32768.0f, 32767.0f);
         pixel.b = (int16_t)CLAMP(src[2], -32768.0f, 32767.0f);
         pixel.a = (int16_t)CLAMP(src[3], -32768.0f, 32767.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R16G16B16A16_SSCALED pixel as RGBA float.
 *
 * All four signed 16-bit channels are converted to float unscaled.  The i and
 * j arguments are not used.
 */
static inline void
util_format_r16g16b16a16_sscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Same handling whether or not UTIL_ARCH_BIG_ENDIAN is set. */
   struct util_format_r16g16b16a16_sscaled texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)texel.r; /* r */
   dst[1] = (float)texel.g; /* g */
   dst[2] = (float)texel.b; /* b */
   dst[3] = (float)texel.a; /* a */
}

static inline void
util_format_r16g16b16a16_sscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = (uint8_t)(((uint32_t)CLAMP(pixel.a, 0, 1)) * 0xff / 0x1); /* a */
#else
         struct util_format_r16g16b16a16_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = (uint8_t)(((uint32_t)CLAMP(pixel.a, 0, 1)) * 0xff / 0x1); /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16a16_sscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_sscaled pixel;
         pixel.r = (int16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int16_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int16_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         pixel.a = (int16_t)(((uint32_t)src[3]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_sscaled pixel;
         pixel.r = (int16_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int16_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int16_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         pixel.a = (int16_t)(((uint32_t)src[3]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack R8_UNORM pixels to RGBA float.
 *
 * The single source byte is converted with ubyte_to_float() into red; green
 * and blue are written as 0 and alpha as 1.  Both strides are in bytes.
 */
static inline void
util_format_r8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (col = 0; col < width; ++col) {
         const uint8_t r = *in;

         out[0] = ubyte_to_float(r); /* r */
         out[1] = 0; /* g */
         out[2] = 0; /* b */
         out[3] = 1; /* a */
         in += 1;
         out += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA float pixels into R8_UNORM.
 *
 * Only red is stored, converted with float_to_ubyte(); the other channels are
 * ignored.  Both strides are in bytes.
 */
static inline void
util_format_r8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col) {
         const uint8_t r = float_to_ubyte(in[0]);

         *out = r;
         in += 4;
         out += 1;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8_UNORM pixel as RGBA float.
 *
 * Red is the source byte converted with ubyte_to_float(); green and blue are
 * 0 and alpha is 1.  The i and j arguments are not used.
 */
static inline void
util_format_r8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint8_t r = src[0];

   dst[0] = ubyte_to_float(r); /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack R8_UNORM pixels to RGBA8 unorm.
 *
 * The source byte is copied to red unchanged; green and blue are written as 0
 * and alpha as 255.  Both strides are in bytes.
 */
static inline void
util_format_r8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col) {
         out[0] = *in; /* r */
         out[1] = 0; /* g */
         out[2] = 0; /* b */
         out[3] = 255; /* a */
         in += 1;
         out += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA8 unorm pixels into R8_UNORM.
 *
 * The red byte of each 4-byte source pixel is copied unchanged; the other
 * channels are ignored.  Both strides are in bytes.
 */
static inline void
util_format_r8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col) {
         *out = in[0];
         in += 4;
         out += 1;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack R8G8_UNORM pixels to RGBA float.
 *
 * Each 2-byte pixel is loaded as a host-order 16-bit word.  When
 * UTIL_ARCH_BIG_ENDIAN is set red is the upper byte and green the lower
 * byte; otherwise they are swapped.  Both are converted with
 * ubyte_to_float().  Blue is written as 0 and alpha as 1.  Both strides are
 * in bytes.
 */
static inline void
util_format_r8g8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value;
         uint16_t r;
         uint16_t g;
         /* src may be unaligned; copy instead of dereferencing a casted
          * pointer. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = value >> 8;
         g = (value) & 0xff;
#else
         r = (value) & 0xff;
         g = value >> 8;
#endif
         dst[0] = ubyte_to_float(r); /* r */
         dst[1] = ubyte_to_float(g); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA float pixels into R8G8_UNORM.
 *
 * Red and green are converted with float_to_ubyte(); blue and alpha are
 * ignored.  The two bytes are merged into one 16-bit word (red in the upper
 * byte when UTIL_ARCH_BIG_ENDIAN is set, in the lower byte otherwise) and
 * written in host byte order.  Both strides are in bytes.
 */
static inline void
util_format_r8g8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint8_t r = float_to_ubyte(src[0]);
         const uint8_t g = float_to_ubyte(src[1]);
         uint16_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (uint16_t)(((uint32_t)r << 8) | g);
#else
         value = (uint16_t)(((uint32_t)g << 8) | r);
#endif
         /* dst carries no alignment guarantee; store via memcpy rather than
          * through a casted uint16_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8G8_UNORM pixel as RGBA float.
 *
 * The 2-byte pixel is loaded as a host-order 16-bit word; red and green are
 * its two bytes (red is the upper byte when UTIL_ARCH_BIG_ENDIAN is set, the
 * lower byte otherwise), converted with ubyte_to_float().  Blue is 0 and
 * alpha is 1.  The i and j arguments are not used.
 */
static inline void
util_format_r8g8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint16_t value;
   uint16_t r;
   uint16_t g;

   /* src may be unaligned; copy instead of dereferencing a casted pointer. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   r = value >> 8;
   g = (value) & 0xff;
#else
   r = (value) & 0xff;
   g = value >> 8;
#endif
   dst[0] = ubyte_to_float(r); /* r */
   dst[1] = ubyte_to_float(g); /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack R8G8_UNORM pixels to RGBA8 unorm.
 *
 * Each 2-byte pixel is loaded as a host-order 16-bit word; red and green are
 * its two bytes (red is the upper byte when UTIL_ARCH_BIG_ENDIAN is set, the
 * lower byte otherwise) and are copied through unchanged.  Blue is written as
 * 0 and alpha as 255.  Both strides are in bytes.
 */
static inline void
util_format_r8g8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value;
         uint16_t r;
         uint16_t g;
         /* src may be unaligned; copy instead of dereferencing a casted
          * pointer. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = value >> 8;
         g = (value) & 0xff;
#else
         r = (value) & 0xff;
         g = value >> 8;
#endif
         dst[0] = r; /* r */
         dst[1] = g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA8 unorm pixels into R8G8_UNORM.
 *
 * The red and green bytes of each 4-byte source pixel are merged into one
 * 16-bit word (red in the upper byte when UTIL_ARCH_BIG_ENDIAN is set, in the
 * lower byte otherwise) and written in host byte order; blue and alpha are
 * ignored.  Both strides are in bytes.
 */
static inline void
util_format_r8g8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (uint16_t)(((uint32_t)src[0] << 8) | src[1]);
#else
         value = (uint16_t)(((uint32_t)src[1] << 8) | src[0]);
#endif
         /* dst carries no alignment guarantee; store via memcpy rather than
          * through a casted uint16_t pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory layout of one R8G8B8_UNORM pixel: three unsigned bytes in r, g, b
 * order.  The field order does not depend on UTIL_ARCH_BIG_ENDIAN.
 */
struct util_format_r8g8b8_unorm {
   uint8_t r;
   uint8_t g;
   uint8_t b;
};

static inline void
util_format_r8g8b8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = ubyte_to_float(pixel.r); /* r */
         dst[1] = ubyte_to_float(pixel.g); /* g */
         dst[2] = ubyte_to_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r8g8b8_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = ubyte_to_float(pixel.r); /* r */
         dst[1] = ubyte_to_float(pixel.g); /* g */
         dst[2] = ubyte_to_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r8g8b8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_unorm pixel;
         pixel.r = float_to_ubyte(src[0]);
         pixel.g = float_to_ubyte(src[1]);
         pixel.b = float_to_ubyte(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_unorm pixel;
         pixel.r = float_to_ubyte(src[0]);
         pixel.g = float_to_ubyte(src[1]);
         pixel.b = float_to_ubyte(src[2]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8G8B8_UNORM pixel as RGBA float.
 *
 * Each of the three bytes is converted with ubyte_to_float() and alpha is set
 * to 1.  The i and j arguments are not used.
 */
static inline void
util_format_r8g8b8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Same handling whether or not UTIL_ARCH_BIG_ENDIAN is set. */
   struct util_format_r8g8b8_unorm texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = ubyte_to_float(texel.r); /* r */
   dst[1] = ubyte_to_float(texel.g); /* g */
   dst[2] = ubyte_to_float(texel.b); /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r8g8b8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r8g8b8_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 255; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r8g8b8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_unorm pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_unorm pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * One B8G8R8_UNORM pixel: blue, green and red, one byte each, in that order.
 * The member order is the same on big- and little-endian builds.
 */
struct util_format_b8g8r8_unorm {
   uint8_t b;
   uint8_t g;
   uint8_t r;
};

static inline void
util_format_b8g8r8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = ubyte_to_float(pixel.r); /* r */
         dst[1] = ubyte_to_float(pixel.g); /* g */
         dst[2] = ubyte_to_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_b8g8r8_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = ubyte_to_float(pixel.r); /* r */
         dst[1] = ubyte_to_float(pixel.g); /* g */
         dst[2] = ubyte_to_float(pixel.b); /* b */
         dst[3] = 1; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_b8g8r8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_unorm pixel;
         pixel.b = float_to_ubyte(src[2]);
         pixel.g = float_to_ubyte(src[1]);
         pixel.r = float_to_ubyte(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_unorm pixel;
         pixel.b = float_to_ubyte(src[2]);
         pixel.g = float_to_ubyte(src[1]);
         pixel.r = float_to_ubyte(src[0]);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one B8G8R8_UNORM pixel from src and expand it to four floats.
 *
 * Channels are written in r, g, b order via ubyte_to_float(); alpha is
 * written as 1.  The i and j arguments are not used.
 */
static inline void
util_format_b8g8r8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_b8g8r8_unorm texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = ubyte_to_float(texel.r); /* r */
   dst[1] = ubyte_to_float(texel.g); /* g */
   dst[2] = ubyte_to_float(texel.b); /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_b8g8r8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_b8g8r8_unorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 255; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_b8g8r8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_unorm pixel;
         pixel.b = src[2];
         pixel.g = src[1];
         pixel.r = src[0];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_unorm pixel;
         pixel.b = src[2];
         pixel.g = src[1];
         pixel.r = src[0];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a width x height rectangle of R8G8B8A8_UNORM pixels to RGBA floats.
 *
 * Every pixel is read as one native-endian 32-bit word, split into four
 * 8-bit channels, and each channel is converted with ubyte_to_float().
 * Both strides are byte counts.
 */
static inline void
util_format_r8g8b8a8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t r;
         uint32_t g;
         uint32_t b;
         uint32_t a;
         /* src is a byte pointer with no alignment guarantee, so load the
          * word with memcpy instead of dereferencing a uint32_t cast. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = value >> 24;
         g = (value >> 16) & 0xff;
         b = (value >> 8) & 0xff;
         a = (value) & 0xff;
#else
         r = (value) & 0xff;
         g = (value >> 8) & 0xff;
         b = (value >> 16) & 0xff;
         a = value >> 24;
#endif
         dst[0] = ubyte_to_float(r); /* r */
         dst[1] = ubyte_to_float(g); /* g */
         dst[2] = ubyte_to_float(b); /* b */
         dst[3] = ubyte_to_float(a); /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA floats into R8G8B8A8_UNORM.
 *
 * Each float channel is converted with float_to_ubyte() and the four
 * results are combined into one native-endian 32-bit word per pixel.
 * Both strides are byte counts.
 */
static inline void
util_format_r8g8b8a8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)(float_to_ubyte(src[0])) << 24;
         value |= (uint32_t)((float_to_ubyte(src[1])) & 0xff) << 16;
         value |= (uint32_t)((float_to_ubyte(src[2])) & 0xff) << 8;
         value |= (float_to_ubyte(src[3])) & 0xff;
#else
         value |= (float_to_ubyte(src[0])) & 0xff;
         value |= (uint32_t)((float_to_ubyte(src[1])) & 0xff) << 8;
         value |= (uint32_t)((float_to_ubyte(src[2])) & 0xff) << 16;
         value |= (uint32_t)(float_to_ubyte(src[3])) << 24;
#endif
         /* dst is a byte pointer with no alignment guarantee, so store the
          * word with memcpy instead of writing through a uint32_t cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8G8B8A8_UNORM pixel from src and expand it to four floats.
 *
 * The pixel is read as one native-endian 32-bit word and each 8-bit
 * channel is converted with ubyte_to_float().  The i and j arguments are
 * not used.
 */
static inline void
util_format_r8g8b8a8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   uint32_t r;
   uint32_t g;
   uint32_t b;
   uint32_t a;

   /* src is a byte pointer with no alignment guarantee, so load the word
    * with memcpy instead of dereferencing a uint32_t cast. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   r = value >> 24;
   g = (value >> 16) & 0xff;
   b = (value >> 8) & 0xff;
   a = (value) & 0xff;
#else
   r = (value) & 0xff;
   g = (value >> 8) & 0xff;
   b = (value >> 16) & 0xff;
   a = value >> 24;
#endif
   dst[0] = ubyte_to_float(r); /* r */
   dst[1] = ubyte_to_float(g); /* g */
   dst[2] = ubyte_to_float(b); /* b */
   dst[3] = ubyte_to_float(a); /* a */
}

/**
 * Unpack a width x height rectangle of R8G8B8A8_UNORM pixels to 8-bit RGBA.
 *
 * Every pixel is read as one native-endian 32-bit word and its four 8-bit
 * channels are written out unchanged in r, g, b, a order.  Both strides
 * are byte counts.
 */
static inline void
util_format_r8g8b8a8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t r;
         uint32_t g;
         uint32_t b;
         uint32_t a;
         /* src is a byte pointer with no alignment guarantee, so load the
          * word with memcpy instead of dereferencing a uint32_t cast. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = value >> 24;
         g = (value >> 16) & 0xff;
         b = (value >> 8) & 0xff;
         a = (value) & 0xff;
#else
         r = (value) & 0xff;
         g = (value >> 8) & 0xff;
         b = (value >> 16) & 0xff;
         a = value >> 24;
#endif
         dst[0] = r; /* r */
         dst[1] = g; /* g */
         dst[2] = b; /* b */
         dst[3] = a; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into R8G8B8A8_UNORM.
 *
 * The four source bytes are combined into one native-endian 32-bit word
 * per pixel.  Both strides are byte counts.
 */
static inline void
util_format_r8g8b8a8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)(src[0]) << 24;
         value |= (uint32_t)((src[1]) & 0xff) << 16;
         value |= (uint32_t)((src[2]) & 0xff) << 8;
         value |= (src[3]) & 0xff;
#else
         value |= (src[0]) & 0xff;
         value |= (uint32_t)((src[1]) & 0xff) << 8;
         value |= (uint32_t)((src[2]) & 0xff) << 16;
         value |= (uint32_t)(src[3]) << 24;
#endif
         /* dst is a byte pointer with no alignment guarantee, so store the
          * word with memcpy instead of writing through a uint32_t cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a width x height rectangle of R8_USCALED pixels to RGBA floats.
 *
 * The single source byte becomes the red value as a plain integer-to-float
 * conversion (no normalization); green and blue are 0 and alpha is 1.
 * Both strides are byte counts.
 */
static inline void
util_format_r8_uscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint8_t red = *in;

         out[0] = (float)red; /* r */
         out[1] = 0; /* g */
         out[2] = 0; /* b */
         out[3] = 1; /* a */

         in += 1;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA floats into R8_USCALED.
 *
 * Only the red float is read; it is passed through CLAMP(..., 0.0f, 255.0f)
 * and converted to a byte.  Both strides are byte counts.
 */
static inline void
util_format_r8_uscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint8_t red = (uint8_t)CLAMP(in[0], 0.0f, 255.0f);

         *out = red;

         in += 4;
         out += 1;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8_USCALED pixel from src and expand it to four floats.
 *
 * Red is the source byte converted to float; green and blue are 0 and
 * alpha is 1.  The i and j arguments are not used.
 */
static inline void
util_format_r8_uscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint8_t red = src[0];

   dst[0] = (float)red; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a width x height rectangle of R8_USCALED pixels to 8-bit RGBA.
 *
 * Red is computed as MIN2(byte, 1) * 0xff; green and blue are 0 and alpha
 * is 255.  Both strides are byte counts.
 */
static inline void
util_format_r8_uscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         uint8_t r = *in;

         out[0] = (uint8_t)(((uint32_t)MIN2(r, 1)) * 0xff / 0x1); /* r */
         out[1] = 0; /* g */
         out[2] = 0; /* b */
         out[3] = 255; /* a */

         in += 1;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into R8_USCALED.
 *
 * Only the red byte is read.  It is integer-divided by 0xff, so the stored
 * value is 1 for a source of 255 and 0 otherwise.  Both strides are byte
 * counts.
 */
static inline void
util_format_r8_uscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (unsigned col = 0; col < width; ++col) {
         const uint8_t red = (uint8_t)(((uint32_t)in[0]) * 0x1 / 0xff);

         *out = red;

         in += 4;
         out += 1;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a width x height rectangle of R8G8_USCALED pixels to RGBA floats.
 *
 * Every pixel is read as one native-endian 16-bit word; its two 8-bit
 * channels are converted to float without normalization.  Blue is 0 and
 * alpha is 1.  Both strides are byte counts.
 */
static inline void
util_format_r8g8_uscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value;
         uint16_t r;
         uint16_t g;
         /* src is a byte pointer with no alignment guarantee, so load the
          * word with memcpy instead of dereferencing a uint16_t cast. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = value >> 8;
         g = (value) & 0xff;
#else
         r = (value) & 0xff;
         g = value >> 8;
#endif
         dst[0] = (float)r; /* r */
         dst[1] = (float)g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA floats into R8G8_USCALED.
 *
 * The red and green floats go through CLAMP(..., 0.0f, 255.0f), are
 * converted to bytes and combined into one native-endian 16-bit word per
 * pixel; blue and alpha are not read.  Both strides are byte counts.
 */
static inline void
util_format_r8g8_uscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint8_t)CLAMP(src[0], 0.0f, 255.0f)) << 8;
         value |= ((uint8_t)CLAMP(src[1], 0.0f, 255.0f)) & 0xff;
#else
         value |= ((uint8_t)CLAMP(src[0], 0.0f, 255.0f)) & 0xff;
         value |= (uint32_t)((uint8_t)CLAMP(src[1], 0.0f, 255.0f)) << 8;
#endif
         /* dst is a byte pointer with no alignment guarantee, so store the
          * word with memcpy instead of writing through a uint16_t cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8G8_USCALED pixel from src and expand it to four floats.
 *
 * The pixel is read as one native-endian 16-bit word; red and green are
 * converted to float without normalization, blue is 0 and alpha is 1.
 * The i and j arguments are not used.
 */
static inline void
util_format_r8g8_uscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint16_t value;
   uint16_t r;
   uint16_t g;

   /* src is a byte pointer with no alignment guarantee, so load the word
    * with memcpy instead of dereferencing a uint16_t cast. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   r = value >> 8;
   g = (value) & 0xff;
#else
   r = (value) & 0xff;
   g = value >> 8;
#endif
   dst[0] = (float)r; /* r */
   dst[1] = (float)g; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a width x height rectangle of R8G8_USCALED pixels to 8-bit RGBA.
 *
 * Every pixel is read as one native-endian 16-bit word.  Red and green are
 * each computed as MIN2(channel, 1) * 0xff; blue is 0 and alpha is 255.
 * Both strides are byte counts.
 */
static inline void
util_format_r8g8_uscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value;
         uint16_t r;
         uint16_t g;
         /* src is a byte pointer with no alignment guarantee, so load the
          * word with memcpy instead of dereferencing a uint16_t cast. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = value >> 8;
         g = (value) & 0xff;
#else
         r = (value) & 0xff;
         g = value >> 8;
#endif
         dst[0] = (uint8_t)(((uint32_t)MIN2(r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)MIN2(g, 1)) * 0xff / 0x1); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of 8-bit RGBA pixels into R8G8_USCALED.
 *
 * The red and green bytes are each integer-divided by 0xff (giving 1 for a
 * source of 255 and 0 otherwise) and combined into one native-endian
 * 16-bit word per pixel; blue and alpha are not read.  Both strides are
 * byte counts.
 */
static inline void
util_format_r8g8_uscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint8_t)(((uint32_t)src[0]) * 0x1 / 0xff)) << 8;
         value |= ((uint8_t)(((uint32_t)src[1]) * 0x1 / 0xff)) & 0xff;
#else
         value |= ((uint8_t)(((uint32_t)src[0]) * 0x1 / 0xff)) & 0xff;
         value |= (uint32_t)((uint8_t)(((uint32_t)src[1]) * 0x1 / 0xff)) << 8;
#endif
         /* dst is a byte pointer with no alignment guarantee, so store the
          * word with memcpy instead of writing through a uint16_t cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * One R8G8B8_USCALED pixel: red, green and blue, one byte each, in that
 * order.  The member order is the same on big- and little-endian builds.
 */
struct util_format_r8g8b8_uscaled {
   uint8_t r;
   uint8_t g;
   uint8_t b;
};

static inline void
util_format_r8g8b8_uscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r8g8b8_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r8g8b8_uscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_uscaled pixel;
         pixel.r = (uint8_t)CLAMP(src[0], 0.0f, 255.0f);
         pixel.g = (uint8_t)CLAMP(src[1], 0.0f, 255.0f);
         pixel.b = (uint8_t)CLAMP(src[2], 0.0f, 255.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_uscaled pixel;
         pixel.r = (uint8_t)CLAMP(src[0], 0.0f, 255.0f);
         pixel.g = (uint8_t)CLAMP(src[1], 0.0f, 255.0f);
         pixel.b = (uint8_t)CLAMP(src[2], 0.0f, 255.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8G8B8_USCALED pixel from src and expand it to four floats.
 *
 * Each channel byte is converted to float without normalization; alpha is
 * written as 1.  The i and j arguments are not used.
 */
static inline void
util_format_r8g8b8_uscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r8g8b8_uscaled texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)texel.r; /* r */
   dst[1] = (float)texel.g; /* g */
   dst[2] = (float)texel.b; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r8g8b8_uscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r8g8b8_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r8g8b8_uscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_uscaled pixel;
         pixel.r = (uint8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_uscaled pixel;
         pixel.r = (uint8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (uint8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (uint8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * One B8G8R8_USCALED pixel: blue, green and red, one byte each, in that
 * order.  The member order is the same on big- and little-endian builds.
 */
struct util_format_b8g8r8_uscaled {
   uint8_t b;
   uint8_t g;
   uint8_t r;
};

static inline void
util_format_b8g8r8_uscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_b8g8r8_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_b8g8r8_uscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_uscaled pixel;
         pixel.b = (uint8_t)CLAMP(src[2], 0.0f, 255.0f);
         pixel.g = (uint8_t)CLAMP(src[1], 0.0f, 255.0f);
         pixel.r = (uint8_t)CLAMP(src[0], 0.0f, 255.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_uscaled pixel;
         pixel.b = (uint8_t)CLAMP(src[2], 0.0f, 255.0f);
         pixel.g = (uint8_t)CLAMP(src[1], 0.0f, 255.0f);
         pixel.r = (uint8_t)CLAMP(src[0], 0.0f, 255.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one B8G8R8_USCALED pixel from src and expand it to four floats.
 *
 * Each channel byte is converted to float without normalization and
 * written in r, g, b order; alpha is written as 1.  The i and j arguments
 * are not used.
 */
static inline void
util_format_b8g8r8_uscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_b8g8r8_uscaled texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)texel.r; /* r */
   dst[1] = (float)texel.g; /* g */
   dst[2] = (float)texel.b; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_b8g8r8_uscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_b8g8r8_uscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MIN2(pixel.r, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)MIN2(pixel.g, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)MIN2(pixel.b, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_b8g8r8_uscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_uscaled pixel;
         pixel.b = (uint8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         pixel.g = (uint8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.r = (uint8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_uscaled pixel;
         pixel.b = (uint8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         pixel.g = (uint8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.r = (uint8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a width x height rectangle of R8G8B8A8_USCALED pixels to RGBA
 * floats.
 *
 * Every pixel is read as one native-endian 32-bit word and its four 8-bit
 * channels are converted to float without normalization.  Both strides
 * are byte counts.
 */
static inline void
util_format_r8g8b8a8_uscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t r;
         uint32_t g;
         uint32_t b;
         uint32_t a;
         /* src is a byte pointer with no alignment guarantee, so load the
          * word with memcpy instead of dereferencing a uint32_t cast. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = value >> 24;
         g = (value >> 16) & 0xff;
         b = (value >> 8) & 0xff;
         a = (value) & 0xff;
#else
         r = (value) & 0xff;
         g = (value >> 8) & 0xff;
         b = (value >> 16) & 0xff;
         a = value >> 24;
#endif
         dst[0] = (float)r; /* r */
         dst[1] = (float)g; /* g */
         dst[2] = (float)b; /* b */
         dst[3] = (float)a; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA floats into R8G8B8A8_USCALED.
 *
 * Each float channel goes through CLAMP(..., 0.0f, 255.0f), is converted
 * to a byte, and the four bytes are combined into one native-endian
 * 32-bit word per pixel.  Both strides are byte counts.
 */
static inline void
util_format_r8g8b8a8_uscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint8_t)CLAMP(src[0], 0.0f, 255.0f)) << 24;
         value |= (uint32_t)(((uint8_t)CLAMP(src[1], 0.0f, 255.0f)) & 0xff) << 16;
         value |= (uint32_t)(((uint8_t)CLAMP(src[2], 0.0f, 255.0f)) & 0xff) << 8;
         value |= ((uint8_t)CLAMP(src[3], 0.0f, 255.0f)) & 0xff;
#else
         value |= ((uint8_t)CLAMP(src[0], 0.0f, 255.0f)) & 0xff;
         value |= (uint32_t)(((uint8_t)CLAMP(src[1], 0.0f, 255.0f)) & 0xff) << 8;
         value |= (uint32_t)(((uint8_t)CLAMP(src[2], 0.0f, 255.0f)) & 0xff) << 16;
         value |= (uint32_t)((uint8_t)CLAMP(src[3], 0.0f, 255.0f)) << 24;
#endif
         /* dst is a byte pointer with no alignment guarantee, so store the
          * word with memcpy instead of writing through a uint32_t cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8G8B8A8_USCALED texel as RGBA floats.
 *
 * The 32-bit word at src is split into four 8-bit unsigned channels and
 * each is converted to float without normalization.  The i and j
 * arguments are not used.
 */
static inline void
util_format_r8g8b8a8_uscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Bit position of each channel inside the packed word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_shift = 24, g_shift = 16, b_shift = 8, a_shift = 0;
#else
   const unsigned r_shift = 0, g_shift = 8, b_shift = 16, a_shift = 24;
#endif
   const uint32_t packed = *(const uint32_t *)src;

   dst[0] = (float)((packed >> r_shift) & 0xff); /* r */
   dst[1] = (float)((packed >> g_shift) & 0xff); /* g */
   dst[2] = (float)((packed >> b_shift) & 0xff); /* b */
   dst[3] = (float)((packed >> a_shift) & 0xff); /* a */
}

/**
 * Unpack rows of R8G8B8A8_USCALED pixels into RGBA 8-bit unorm.
 *
 * Each 8-bit channel c is converted as MIN2(c, 1) * 0xff, so the output
 * byte is the MIN2 result scaled by 255.  src_stride and dst_stride are in
 * bytes.
 */
static inline void
util_format_r8g8b8a8_uscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the packed word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_shift = 24, g_shift = 16, b_shift = 8, a_shift = 0;
#else
   const unsigned r_shift = 0, g_shift = 8, b_shift = 16, a_shift = 24;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *src_px = src_row;
      uint8_t *dst_px = dst_row;

      for (col = 0; col < width; ++col, src_px += 4, dst_px += 4) {
         const uint32_t packed = *(const uint32_t *)src_px;
         const uint32_t r = (packed >> r_shift) & 0xff;
         const uint32_t g = (packed >> g_shift) & 0xff;
         const uint32_t b = (packed >> b_shift) & 0xff;
         const uint32_t a = (packed >> a_shift) & 0xff;

         dst_px[0] = (uint8_t)(((uint32_t)MIN2(r, 1)) * 0xff / 0x1); /* r */
         dst_px[1] = (uint8_t)(((uint32_t)MIN2(g, 1)) * 0xff / 0x1); /* g */
         dst_px[2] = (uint8_t)(((uint32_t)MIN2(b, 1)) * 0xff / 0x1); /* b */
         dst_px[3] = (uint8_t)(((uint32_t)MIN2(a, 1)) * 0xff / 0x1); /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA 8-bit unorm pixels into R8G8B8A8_USCALED.
 *
 * Each source byte is converted with the integer expression c * 1 / 255,
 * which yields 1 for 255 and 0 for every other value.  The four results
 * are combined into one 32-bit word.  Strides are in bytes.
 */
static inline void
util_format_r8g8b8a8_uscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the packed word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_shift = 24, g_shift = 16, b_shift = 8, a_shift = 0;
#else
   const unsigned r_shift = 0, g_shift = 8, b_shift = 16, a_shift = 24;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *src_px = src_row;
      uint8_t *dst_px = dst_row;

      for (col = 0; col < width; ++col, src_px += 4, dst_px += 4) {
         const uint32_t r = (uint8_t)(((uint32_t)src_px[0]) * 0x1 / 0xff);
         const uint32_t g = (uint8_t)(((uint32_t)src_px[1]) * 0x1 / 0xff);
         const uint32_t b = (uint8_t)(((uint32_t)src_px[2]) * 0x1 / 0xff);
         const uint32_t a = (uint8_t)(((uint32_t)src_px[3]) * 0x1 / 0xff);

         *(uint32_t *)dst_px = (r << r_shift) | (g << g_shift) |
                               (b << b_shift) | (a << a_shift);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack rows of B8G8R8A8_USCALED pixels into RGBA floats.
 *
 * Every pixel is read as one 32-bit word and split into four 8-bit
 * unsigned channels; each is converted to float as-is (no normalization)
 * and written in R, G, B, A order.  Strides are in bytes.
 */
static inline void
util_format_b8g8r8a8_uscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the packed word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8, a_shift = 0;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16, a_shift = 24;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *src_px = src_row;
      float *dst_px = dst_row;

      for (col = 0; col < width; ++col, src_px += 4, dst_px += 4) {
         const uint32_t packed = *(const uint32_t *)src_px;

         dst_px[0] = (float)((packed >> r_shift) & 0xff); /* r */
         dst_px[1] = (float)((packed >> g_shift) & 0xff); /* g */
         dst_px[2] = (float)((packed >> b_shift) & 0xff); /* b */
         dst_px[3] = (float)((packed >> a_shift) & 0xff); /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA float pixels into B8G8R8A8_USCALED.
 *
 * Each source channel is passed through CLAMP(x, 0.0f, 255.0f), converted
 * to uint8_t, and the four bytes are combined into one 32-bit word.
 * dst_stride and src_stride are in bytes.
 */
static inline void
util_format_b8g8r8a8_uscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the packed word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8, a_shift = 0;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16, a_shift = 24;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *src_px = src_row;
      uint8_t *dst_px = dst_row;

      for (col = 0; col < width; ++col, src_px += 4, dst_px += 4) {
         const uint32_t b = (uint8_t)CLAMP(src_px[2], 0.0f, 255.0f);
         const uint32_t g = (uint8_t)CLAMP(src_px[1], 0.0f, 255.0f);
         const uint32_t r = (uint8_t)CLAMP(src_px[0], 0.0f, 255.0f);
         const uint32_t a = (uint8_t)CLAMP(src_px[3], 0.0f, 255.0f);

         *(uint32_t *)dst_px = (b << b_shift) | (g << g_shift) |
                               (r << r_shift) | (a << a_shift);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one B8G8R8A8_USCALED texel as RGBA floats.
 *
 * The 32-bit word at src is split into four 8-bit unsigned channels and
 * each is converted to float without normalization.  The i and j
 * arguments are not used.
 */
static inline void
util_format_b8g8r8a8_uscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Bit position of each channel inside the packed word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8, a_shift = 0;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16, a_shift = 24;
#endif
   const uint32_t packed = *(const uint32_t *)src;

   dst[0] = (float)((packed >> r_shift) & 0xff); /* r */
   dst[1] = (float)((packed >> g_shift) & 0xff); /* g */
   dst[2] = (float)((packed >> b_shift) & 0xff); /* b */
   dst[3] = (float)((packed >> a_shift) & 0xff); /* a */
}

/**
 * Unpack rows of B8G8R8A8_USCALED pixels into RGBA 8-bit unorm.
 *
 * Each 8-bit channel c is converted as MIN2(c, 1) * 0xff and written in
 * R, G, B, A order.  src_stride and dst_stride are in bytes.
 */
static inline void
util_format_b8g8r8a8_uscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the packed word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8, a_shift = 0;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16, a_shift = 24;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *src_px = src_row;
      uint8_t *dst_px = dst_row;

      for (col = 0; col < width; ++col, src_px += 4, dst_px += 4) {
         const uint32_t packed = *(const uint32_t *)src_px;
         const uint32_t b = (packed >> b_shift) & 0xff;
         const uint32_t g = (packed >> g_shift) & 0xff;
         const uint32_t r = (packed >> r_shift) & 0xff;
         const uint32_t a = (packed >> a_shift) & 0xff;

         dst_px[0] = (uint8_t)(((uint32_t)MIN2(r, 1)) * 0xff / 0x1); /* r */
         dst_px[1] = (uint8_t)(((uint32_t)MIN2(g, 1)) * 0xff / 0x1); /* g */
         dst_px[2] = (uint8_t)(((uint32_t)MIN2(b, 1)) * 0xff / 0x1); /* b */
         dst_px[3] = (uint8_t)(((uint32_t)MIN2(a, 1)) * 0xff / 0x1); /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA 8-bit unorm pixels into B8G8R8A8_USCALED.
 *
 * Each source byte is converted with the integer expression c * 1 / 255,
 * which yields 1 for 255 and 0 for every other value.  The four results
 * are combined into one 32-bit word.  Strides are in bytes.
 */
static inline void
util_format_b8g8r8a8_uscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the packed word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned b_shift = 24, g_shift = 16, r_shift = 8, a_shift = 0;
#else
   const unsigned b_shift = 0, g_shift = 8, r_shift = 16, a_shift = 24;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *src_px = src_row;
      uint8_t *dst_px = dst_row;

      for (col = 0; col < width; ++col, src_px += 4, dst_px += 4) {
         const uint32_t b = (uint8_t)(((uint32_t)src_px[2]) * 0x1 / 0xff);
         const uint32_t g = (uint8_t)(((uint32_t)src_px[1]) * 0x1 / 0xff);
         const uint32_t r = (uint8_t)(((uint32_t)src_px[0]) * 0x1 / 0xff);
         const uint32_t a = (uint8_t)(((uint32_t)src_px[3]) * 0x1 / 0xff);

         *(uint32_t *)dst_px = (b << b_shift) | (g << g_shift) |
                               (r << r_shift) | (a << a_shift);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack rows of A8B8G8R8_USCALED pixels into RGBA floats.
 *
 * Every pixel is read as one 32-bit word and split into four 8-bit
 * unsigned channels; each is converted to float as-is (no normalization)
 * and written in R, G, B, A order.  Strides are in bytes.
 */
static inline void
util_format_a8b8g8r8_uscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the packed word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, b_shift = 16, g_shift = 8, r_shift = 0;
#else
   const unsigned a_shift = 0, b_shift = 8, g_shift = 16, r_shift = 24;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *src_px = src_row;
      float *dst_px = dst_row;

      for (col = 0; col < width; ++col, src_px += 4, dst_px += 4) {
         const uint32_t packed = *(const uint32_t *)src_px;

         dst_px[0] = (float)((packed >> r_shift) & 0xff); /* r */
         dst_px[1] = (float)((packed >> g_shift) & 0xff); /* g */
         dst_px[2] = (float)((packed >> b_shift) & 0xff); /* b */
         dst_px[3] = (float)((packed >> a_shift) & 0xff); /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA float pixels into A8B8G8R8_USCALED.
 *
 * Each source channel is passed through CLAMP(x, 0.0f, 255.0f), converted
 * to uint8_t, and the four bytes are combined into one 32-bit word.
 * dst_stride and src_stride are in bytes.
 */
static inline void
util_format_a8b8g8r8_uscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the packed word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, b_shift = 16, g_shift = 8, r_shift = 0;
#else
   const unsigned a_shift = 0, b_shift = 8, g_shift = 16, r_shift = 24;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *src_px = src_row;
      uint8_t *dst_px = dst_row;

      for (col = 0; col < width; ++col, src_px += 4, dst_px += 4) {
         const uint32_t a = (uint8_t)CLAMP(src_px[3], 0.0f, 255.0f);
         const uint32_t b = (uint8_t)CLAMP(src_px[2], 0.0f, 255.0f);
         const uint32_t g = (uint8_t)CLAMP(src_px[1], 0.0f, 255.0f);
         const uint32_t r = (uint8_t)CLAMP(src_px[0], 0.0f, 255.0f);

         *(uint32_t *)dst_px = (a << a_shift) | (b << b_shift) |
                               (g << g_shift) | (r << r_shift);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one A8B8G8R8_USCALED texel as RGBA floats.
 *
 * The 32-bit word at src is split into four 8-bit unsigned channels and
 * each is converted to float without normalization.  The i and j
 * arguments are not used.
 */
static inline void
util_format_a8b8g8r8_uscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Bit position of each channel inside the packed word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, b_shift = 16, g_shift = 8, r_shift = 0;
#else
   const unsigned a_shift = 0, b_shift = 8, g_shift = 16, r_shift = 24;
#endif
   const uint32_t packed = *(const uint32_t *)src;

   dst[0] = (float)((packed >> r_shift) & 0xff); /* r */
   dst[1] = (float)((packed >> g_shift) & 0xff); /* g */
   dst[2] = (float)((packed >> b_shift) & 0xff); /* b */
   dst[3] = (float)((packed >> a_shift) & 0xff); /* a */
}

/**
 * Unpack rows of A8B8G8R8_USCALED pixels into RGBA 8-bit unorm.
 *
 * Each 8-bit channel c is converted as MIN2(c, 1) * 0xff and written in
 * R, G, B, A order.  src_stride and dst_stride are in bytes.
 */
static inline void
util_format_a8b8g8r8_uscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the packed word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, b_shift = 16, g_shift = 8, r_shift = 0;
#else
   const unsigned a_shift = 0, b_shift = 8, g_shift = 16, r_shift = 24;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *src_px = src_row;
      uint8_t *dst_px = dst_row;

      for (col = 0; col < width; ++col, src_px += 4, dst_px += 4) {
         const uint32_t packed = *(const uint32_t *)src_px;
         const uint32_t a = (packed >> a_shift) & 0xff;
         const uint32_t b = (packed >> b_shift) & 0xff;
         const uint32_t g = (packed >> g_shift) & 0xff;
         const uint32_t r = (packed >> r_shift) & 0xff;

         dst_px[0] = (uint8_t)(((uint32_t)MIN2(r, 1)) * 0xff / 0x1); /* r */
         dst_px[1] = (uint8_t)(((uint32_t)MIN2(g, 1)) * 0xff / 0x1); /* g */
         dst_px[2] = (uint8_t)(((uint32_t)MIN2(b, 1)) * 0xff / 0x1); /* b */
         dst_px[3] = (uint8_t)(((uint32_t)MIN2(a, 1)) * 0xff / 0x1); /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA 8-bit unorm pixels into A8B8G8R8_USCALED.
 *
 * Each source byte is converted with the integer expression c * 1 / 255,
 * which yields 1 for 255 and 0 for every other value.  The four results
 * are combined into one 32-bit word.  Strides are in bytes.
 */
static inline void
util_format_a8b8g8r8_uscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /* Bit position of each channel inside the packed word. */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned a_shift = 24, b_shift = 16, g_shift = 8, r_shift = 0;
#else
   const unsigned a_shift = 0, b_shift = 8, g_shift = 16, r_shift = 24;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *src_px = src_row;
      uint8_t *dst_px = dst_row;

      for (col = 0; col < width; ++col, src_px += 4, dst_px += 4) {
         const uint32_t a = (uint8_t)(((uint32_t)src_px[3]) * 0x1 / 0xff);
         const uint32_t b = (uint8_t)(((uint32_t)src_px[2]) * 0x1 / 0xff);
         const uint32_t g = (uint8_t)(((uint32_t)src_px[1]) * 0x1 / 0xff);
         const uint32_t r = (uint8_t)(((uint32_t)src_px[0]) * 0x1 / 0xff);

         *(uint32_t *)dst_px = (a << a_shift) | (b << b_shift) |
                               (g << g_shift) | (r << r_shift);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack rows of R8_SNORM pixels into RGBA floats.
 *
 * The single signed byte is scaled by 1/127 into the red channel; green
 * and blue are written as 0 and alpha as 1.  The result is clamped at
 * -1.0 so that the most negative code (-128) maps to exactly -1.0 instead
 * of slightly below it, as the GL/Vulkan normalized fixed-point
 * conversion rules require.  src_stride and dst_stride are in bytes.
 */
static inline void
util_format_r8_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         const int8_t r = (int8_t)*src;
         const float rf = (float)(r * (1.0f/0x7f));

         /* -128 * (1/127) is below -1.0; clamp it. */
         dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 1;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA float pixels into R8_SNORM.
 *
 * Only the red channel is stored: it goes through
 * CLAMP(x, -1.0f, 1.0f), is multiplied by 127, converted with
 * util_iround() and written as one signed byte.  Strides are in bytes.
 */
static inline void
util_format_r8_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *src_px = src_row;
      uint8_t *dst_px = dst_row;

      for (col = 0; col < width; ++col, src_px += 4, dst_px += 1) {
         const int8_t r = (int8_t)util_iround(CLAMP(src_px[0], -1.0f, 1.0f) * 0x7f);

         *dst_px = (uint8_t)r;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one R8_SNORM texel as RGBA floats.
 *
 * The signed byte at src is scaled by 1/127 into red; green and blue are
 * 0 and alpha is 1.  The result is clamped at -1.0 so that the most
 * negative code (-128) yields exactly -1.0, per the GL/Vulkan normalized
 * fixed-point conversion rules.  The i and j arguments are not used.
 */
static inline void
util_format_r8_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const int8_t r = (int8_t)*src;
   const float rf = (float)(r * (1.0f/0x7f));

   /* -128 * (1/127) is below -1.0; clamp it. */
   dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack rows of R8_SNORM pixels into RGBA 8-bit unorm.
 *
 * Red is computed as MAX2(r, 0) * 255 / 127 using integer arithmetic;
 * green and blue are written as 0 and alpha as 255.  Strides are in bytes.
 */
static inline void
util_format_r8_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *src_px = src_row;
      uint8_t *dst_px = dst_row;

      for (col = 0; col < width; ++col, src_px += 1, dst_px += 4) {
         const int8_t r = (int8_t)*src_px;

         dst_px[0] = (uint8_t)(((uint32_t)MAX2(r, 0)) * 0xff / 0x7f); /* r */
         dst_px[1] = 0; /* g */
         dst_px[2] = 0; /* b */
         dst_px[3] = 255; /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA 8-bit unorm pixels into R8_SNORM.
 *
 * Only the red byte is stored, shifted right by one bit so that
 * 0..255 maps onto 0..127.  Strides are in bytes.
 */
static inline void
util_format_r8_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *src_px = src_row;
      uint8_t *dst_px = dst_row;

      for (col = 0; col < width; ++col, src_px += 4, dst_px += 1) {
         const int8_t r = (int8_t)(src_px[0] >> 1);

         *dst_px = (uint8_t)r;
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack rows of R8G8_SNORM pixels into RGBA floats.
 *
 * Each pixel is read as a 16-bit word holding two signed bytes.  Red and
 * green are scaled by 1/127; blue is written as 0 and alpha as 1.  Both
 * results are clamped at -1.0 so that the most negative code (-128) maps
 * to exactly -1.0, per the GL/Vulkan normalized fixed-point conversion
 * rules.  src_stride and dst_stride are in bytes.
 */
static inline void
util_format_r8g8_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /*
    * Left shift that moves each channel into the top byte of the 16-bit
    * word, from where it is sign-extended by the right shift below.
    */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_lshift = 0, g_lshift = 8;
#else
   const unsigned r_lshift = 8, g_lshift = 0;
#endif
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         const uint16_t value = *(const uint16_t *)src;
         const int16_t r = ((int16_t)(value << r_lshift)) >> 8;
         const int16_t g = ((int16_t)(value << g_lshift)) >> 8;
         const float rf = (float)(r * (1.0f/0x7f));
         const float gf = (float)(g * (1.0f/0x7f));

         /* -128 * (1/127) is below -1.0; clamp it. */
         dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
         dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA float pixels into R8G8_SNORM.
 *
 * Red and green each go through CLAMP(x, -1.0f, 1.0f), are multiplied by
 * 127 and converted with util_iround() to a signed byte; the two bytes
 * are combined into one 16-bit word.  Strides are in bytes.
 */
static inline void
util_format_r8g8_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *src_px = src_row;
      uint8_t *dst_px = dst_row;

      for (col = 0; col < width; ++col, src_px += 4, dst_px += 2) {
         const int8_t r = (int8_t)util_iround(CLAMP(src_px[0], -1.0f, 1.0f) * 0x7f);
         const int8_t g = (int8_t)util_iround(CLAMP(src_px[1], -1.0f, 1.0f) * 0x7f);
         /* Which channel occupies the high and low byte of the word. */
#if UTIL_ARCH_BIG_ENDIAN
         const int8_t hi = r, lo = g;
#else
         const int8_t hi = g, lo = r;
#endif

         *(uint16_t *)dst_px = (uint16_t)(((uint32_t)hi << 8) | (uint32_t)(lo & 0xff));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one R8G8_SNORM texel as RGBA floats.
 *
 * The 16-bit word at src holds two signed bytes.  Red and green are
 * scaled by 1/127; blue is 0 and alpha is 1.  Both results are clamped at
 * -1.0 so that the most negative code (-128) yields exactly -1.0, per the
 * GL/Vulkan normalized fixed-point conversion rules.  The i and j
 * arguments are not used.
 */
static inline void
util_format_r8g8_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /*
    * Left shift that moves each channel into the top byte of the 16-bit
    * word, from where it is sign-extended by the right shift below.
    */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_lshift = 0, g_lshift = 8;
#else
   const unsigned r_lshift = 8, g_lshift = 0;
#endif
   const uint16_t value = *(const uint16_t *)src;
   const int16_t r = ((int16_t)(value << r_lshift)) >> 8;
   const int16_t g = ((int16_t)(value << g_lshift)) >> 8;
   const float rf = (float)(r * (1.0f/0x7f));
   const float gf = (float)(g * (1.0f/0x7f));

   /* -128 * (1/127) is below -1.0; clamp it. */
   dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
   dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack rows of R8G8_SNORM pixels into RGBA 8-bit unorm.
 *
 * Red and green are each computed as MAX2(c, 0) * 255 / 127 using integer
 * arithmetic; blue is written as 0 and alpha as 255.  Strides are in
 * bytes.
 */
static inline void
util_format_r8g8_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   /*
    * Left shift that moves each channel into the top byte of the 16-bit
    * word, from where it is sign-extended by the right shift below.
    */
#if UTIL_ARCH_BIG_ENDIAN
   const unsigned r_lshift = 0, g_lshift = 8;
#else
   const unsigned r_lshift = 8, g_lshift = 0;
#endif
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *src_px = src_row;
      uint8_t *dst_px = dst_row;

      for (col = 0; col < width; ++col, src_px += 2, dst_px += 4) {
         const uint16_t value = *(const uint16_t *)src_px;
         const int16_t r = ((int16_t)(value << r_lshift)) >> 8;
         const int16_t g = ((int16_t)(value << g_lshift)) >> 8;

         dst_px[0] = (uint8_t)(((uint32_t)MAX2(r, 0)) * 0xff / 0x7f); /* r */
         dst_px[1] = (uint8_t)(((uint32_t)MAX2(g, 0)) * 0xff / 0x7f); /* g */
         dst_px[2] = 0; /* b */
         dst_px[3] = 255; /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA 8-bit unorm pixels into R8G8_SNORM.
 *
 * Red and green are each shifted right by one bit (0..255 becomes 0..127)
 * and the two bytes are combined into one 16-bit word.  Strides are in
 * bytes.
 */
static inline void
util_format_r8g8_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *src_px = src_row;
      uint8_t *dst_px = dst_row;

      for (col = 0; col < width; ++col, src_px += 4, dst_px += 2) {
         const int8_t r = (int8_t)(src_px[0] >> 1);
         const int8_t g = (int8_t)(src_px[1] >> 1);
         /* Which channel occupies the high and low byte of the word. */
#if UTIL_ARCH_BIG_ENDIAN
         const int8_t hi = r, lo = g;
#else
         const int8_t hi = g, lo = r;
#endif

         *(uint16_t *)dst_px = (uint16_t)(((uint32_t)hi << 8) | (uint32_t)(lo & 0xff));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R8G8B8_SNORM pixel: three signed bytes in
 * r, g, b order.  The field order is the same for big- and little-endian
 * builds, so no endianness conditional is needed.
 */
struct util_format_r8g8b8_snorm {
   int8_t r;
   int8_t g;
   int8_t b;
};

/**
 * Unpack rows of R8G8B8_SNORM pixels into RGBA floats.
 *
 * Each 3-byte pixel is copied into a struct util_format_r8g8b8_snorm and
 * its signed channels are scaled by 1/127; alpha is written as 1.  Results
 * are clamped at -1.0 so that the most negative code (-128) maps to
 * exactly -1.0, per the GL/Vulkan normalized fixed-point conversion rules.
 * src_stride and dst_stride are in bytes.
 */
static inline void
util_format_r8g8b8_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         /* The big- and little-endian code paths were identical. */
         struct util_format_r8g8b8_snorm pixel;
         float rf, gf, bf;
         memcpy(&pixel, src, sizeof pixel);
         rf = (float)(pixel.r * (1.0f/0x7f));
         gf = (float)(pixel.g * (1.0f/0x7f));
         bf = (float)(pixel.b * (1.0f/0x7f));
         /* -128 * (1/127) is below -1.0; clamp it. */
         dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
         dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
         dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
         dst[3] = 1; /* a */
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r8g8b8_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_snorm pixel;
         pixel.r = (int8_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7f);
         pixel.g = (int8_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7f);
         pixel.b = (int8_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_snorm pixel;
         pixel.r = (int8_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7f);
         pixel.g = (int8_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7f);
         pixel.b = (int8_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R8G8B8_SNORM texel as RGBA floats.
 *
 * The three bytes at src are copied into a struct
 * util_format_r8g8b8_snorm and each signed channel is scaled by 1/127;
 * alpha is 1.  Results are clamped at -1.0 so that the most negative code
 * (-128) yields exactly -1.0, per the GL/Vulkan normalized fixed-point
 * conversion rules.  The i and j arguments are not used.
 */
static inline void
util_format_r8g8b8_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The big- and little-endian code paths were identical. */
   struct util_format_r8g8b8_snorm pixel;
   float rf, gf, bf;

   memcpy(&pixel, src, sizeof pixel);
   rf = (float)(pixel.r * (1.0f/0x7f));
   gf = (float)(pixel.g * (1.0f/0x7f));
   bf = (float)(pixel.b * (1.0f/0x7f));

   /* -128 * (1/127) is below -1.0; clamp it. */
   dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
   dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
   dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r8g8b8_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MAX2(pixel.r, 0)) * 0xff / 0x7f); /* r */
         dst[1] = (uint8_t)(((uint32_t)MAX2(pixel.g, 0)) * 0xff / 0x7f); /* g */
         dst[2] = (uint8_t)(((uint32_t)MAX2(pixel.b, 0)) * 0xff / 0x7f); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r8g8b8_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MAX2(pixel.r, 0)) * 0xff / 0x7f); /* r */
         dst[1] = (uint8_t)(((uint32_t)MAX2(pixel.g, 0)) * 0xff / 0x7f); /* g */
         dst[2] = (uint8_t)(((uint32_t)MAX2(pixel.b, 0)) * 0xff / 0x7f); /* b */
         dst[3] = 255; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r8g8b8_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_snorm pixel;
         pixel.r = (int8_t)(src[0] >> 1);
         pixel.g = (int8_t)(src[1] >> 1);
         pixel.b = (int8_t)(src[2] >> 1);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_snorm pixel;
         pixel.r = (int8_t)(src[0] >> 1);
         pixel.g = (int8_t)(src[1] >> 1);
         pixel.b = (int8_t)(src[2] >> 1);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one B8G8R8_SNORM pixel: three signed bytes in
 * b, g, r order.  The field order is the same for big- and little-endian
 * builds, so no endianness conditional is needed.
 */
struct util_format_b8g8r8_snorm {
   int8_t b;
   int8_t g;
   int8_t r;
};

/**
 * Unpack rows of B8G8R8_SNORM pixels into RGBA floats.
 *
 * Each 3-byte pixel is copied into a struct util_format_b8g8r8_snorm and
 * its signed channels are scaled by 1/127 and written in R, G, B order;
 * alpha is written as 1.  Results are clamped at -1.0 so that the most
 * negative code (-128) maps to exactly -1.0, per the GL/Vulkan normalized
 * fixed-point conversion rules.  src_stride and dst_stride are in bytes.
 */
static inline void
util_format_b8g8r8_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         /* The big- and little-endian code paths were identical. */
         struct util_format_b8g8r8_snorm pixel;
         float rf, gf, bf;
         memcpy(&pixel, src, sizeof pixel);
         rf = (float)(pixel.r * (1.0f/0x7f));
         gf = (float)(pixel.g * (1.0f/0x7f));
         bf = (float)(pixel.b * (1.0f/0x7f));
         /* -128 * (1/127) is below -1.0; clamp it. */
         dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
         dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
         dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
         dst[3] = 1; /* a */
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_b8g8r8_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_snorm pixel;
         pixel.b = (int8_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7f);
         pixel.g = (int8_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7f);
         pixel.r = (int8_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_snorm pixel;
         pixel.b = (int8_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x7f);
         pixel.g = (int8_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x7f);
         pixel.r = (int8_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x7f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B8G8R8_SNORM pixel as RGBA floats.
 *
 * Reads three signed bytes (b, g, r) from src, scales each by 1/127 and
 * clamps at -1.0f so the raw value -128 yields exactly -1.0.  Alpha is
 * written as 1.  The i and j arguments are not used.
 *
 * \param dst  receives four floats (r, g, b, a)
 * \param src  points at the three-byte pixel
 */
static inline void
util_format_b8g8r8_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The pixel struct lists its members in the same order on big- and
    * little-endian builds, so one code path serves both. */
   struct util_format_b8g8r8_snorm pixel;
   float r, g, b;
   memcpy(&pixel, src, sizeof pixel);
   r = (float)(pixel.r * (1.0f/0x7f));
   g = (float)(pixel.g * (1.0f/0x7f));
   b = (float)(pixel.b * (1.0f/0x7f));
   /* Only -128 can fall below -1.0; pin it to the range boundary. */
   dst[0] = r < -1.0f ? -1.0f : r; /* r */
   dst[1] = g < -1.0f ? -1.0f : g; /* g */
   dst[2] = b < -1.0f ? -1.0f : b; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_b8g8r8_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MAX2(pixel.r, 0)) * 0xff / 0x7f); /* r */
         dst[1] = (uint8_t)(((uint32_t)MAX2(pixel.g, 0)) * 0xff / 0x7f); /* g */
         dst[2] = (uint8_t)(((uint32_t)MAX2(pixel.b, 0)) * 0xff / 0x7f); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_b8g8r8_snorm pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)MAX2(pixel.r, 0)) * 0xff / 0x7f); /* r */
         dst[1] = (uint8_t)(((uint32_t)MAX2(pixel.g, 0)) * 0xff / 0x7f); /* g */
         dst[2] = (uint8_t)(((uint32_t)MAX2(pixel.b, 0)) * 0xff / 0x7f); /* b */
         dst[3] = 255; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_b8g8r8_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_snorm pixel;
         pixel.b = (int8_t)(src[2] >> 1);
         pixel.g = (int8_t)(src[1] >> 1);
         pixel.r = (int8_t)(src[0] >> 1);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_snorm pixel;
         pixel.b = (int8_t)(src[2] >> 1);
         pixel.g = (int8_t)(src[1] >> 1);
         pixel.r = (int8_t)(src[0] >> 1);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a width x height rectangle of R8G8B8A8_SNORM pixels into RGBA floats.
 *
 * Each pixel is loaded as one 32-bit word; every 8-bit channel is
 * sign-extended, scaled by 1/127 and clamped at -1.0f, so the raw value -128
 * yields exactly -1.0 instead of a value slightly below the normalized range.
 *
 * \param dst_row     first destination row, four floats per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, four bytes per pixel
 * \param src_stride  byte distance between source rows
 * \param width       number of pixels per row
 * \param height      number of rows
 */
static inline void
util_format_r8g8b8a8_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t value = *(const uint32_t *)src;
         int32_t r, g, b, a;
         float rf, gf, bf, af;
         /* Shift the wanted byte to the top, then shift back down as a
          * signed value to sign-extend it.  Only the byte positions depend
          * on endianness. */
#if UTIL_ARCH_BIG_ENDIAN
         r = ((int32_t)(value) ) >> 24;
         g = ((int32_t)(value << 8) ) >> 24;
         b = ((int32_t)(value << 16) ) >> 24;
         a = ((int32_t)(value << 24) ) >> 24;
#else
         r = ((int32_t)(value << 24) ) >> 24;
         g = ((int32_t)(value << 16) ) >> 24;
         b = ((int32_t)(value << 8) ) >> 24;
         a = ((int32_t)(value) ) >> 24;
#endif
         rf = (float)(r * (1.0f/0x7f));
         gf = (float)(g * (1.0f/0x7f));
         bf = (float)(b * (1.0f/0x7f));
         af = (float)(a * (1.0f/0x7f));
         /* Only -128 can fall below -1.0; pin it to the range boundary. */
         dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
         dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
         dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
         dst[3] = af < -1.0f ? -1.0f : af; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      /* dst_stride is in bytes while dst_row steps in floats. */
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA floats into R8G8B8A8_SNORM pixels.
 *
 * Every channel is clamped to [-1, 1], scaled by 127 and rounded with
 * util_iround(); the four resulting bytes are combined into one 32-bit word
 * that is stored per pixel.
 *
 * \param dst_row     first destination row, four bytes per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, four floats per pixel
 * \param src_stride  byte distance between source rows
 * \param width       number of pixels per row
 * \param height      number of rows
 */
static inline void
util_format_r8g8b8a8_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;
   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;
      for (col = 0; col < width; ++col, in += 4, out += 4) {
         /* Quantize once; endianness only decides where each byte lands. */
         const int8_t r = (int8_t)util_iround(CLAMP(in[0], -1.0f, 1.0f) * 0x7f);
         const int8_t g = (int8_t)util_iround(CLAMP(in[1], -1.0f, 1.0f) * 0x7f);
         const int8_t b = (int8_t)util_iround(CLAMP(in[2], -1.0f, 1.0f) * 0x7f);
         const int8_t a = (int8_t)util_iround(CLAMP(in[3], -1.0f, 1.0f) * 0x7f);
         uint32_t packed;
#if UTIL_ARCH_BIG_ENDIAN
         packed  = (uint32_t)r << 24;
         packed |= (uint32_t)(g & 0xff) << 16;
         packed |= (uint32_t)(b & 0xff) << 8;
         packed |= (uint32_t)(a & 0xff);
#else
         packed  = (uint32_t)(r & 0xff);
         packed |= (uint32_t)(g & 0xff) << 8;
         packed |= (uint32_t)(b & 0xff) << 16;
         packed |= (uint32_t)a << 24;
#endif
         *(uint32_t *)out = packed;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes while src_row steps in floats. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8B8A8_SNORM pixel as RGBA floats.
 *
 * The pixel is loaded as one 32-bit word; every 8-bit channel is
 * sign-extended, scaled by 1/127 and clamped at -1.0f so the raw value -128
 * yields exactly -1.0.  The i and j arguments are not used.
 *
 * \param dst  receives four floats (r, g, b, a)
 * \param src  points at the four-byte pixel
 */
static inline void
util_format_r8g8b8a8_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint32_t value = *(const uint32_t *)src;
   int32_t r, g, b, a;
   float rf, gf, bf, af;
   /* Shift the wanted byte to the top, then shift back down as a signed
    * value to sign-extend it.  Only the byte positions depend on
    * endianness. */
#if UTIL_ARCH_BIG_ENDIAN
   r = ((int32_t)(value) ) >> 24;
   g = ((int32_t)(value << 8) ) >> 24;
   b = ((int32_t)(value << 16) ) >> 24;
   a = ((int32_t)(value << 24) ) >> 24;
#else
   r = ((int32_t)(value << 24) ) >> 24;
   g = ((int32_t)(value << 16) ) >> 24;
   b = ((int32_t)(value << 8) ) >> 24;
   a = ((int32_t)(value) ) >> 24;
#endif
   rf = (float)(r * (1.0f/0x7f));
   gf = (float)(g * (1.0f/0x7f));
   bf = (float)(b * (1.0f/0x7f));
   af = (float)(a * (1.0f/0x7f));
   /* Only -128 can fall below -1.0; pin it to the range boundary. */
   dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
   dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
   dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
   dst[3] = af < -1.0f ? -1.0f : af; /* a */
}

/**
 * Unpack a width x height rectangle of R8G8B8A8_SNORM pixels into RGBA8 unorm.
 *
 * Each channel is sign-extended from the 32-bit pixel word; negative values
 * are raised to 0 and the remaining range is rescaled with "* 0xff / 0x7f".
 *
 * \param dst_row     first destination row, four bytes per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, four bytes per pixel
 * \param src_stride  byte distance between source rows
 * \param width       number of pixels per row
 * \param height      number of rows
 */
static inline void
util_format_r8g8b8a8_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;
      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t word = *(const uint32_t *)in;
         int32_t r, g, b, a;
         /* Only the byte positions inside the word depend on endianness. */
#if UTIL_ARCH_BIG_ENDIAN
         r = ((int32_t)(word) ) >> 24;
         g = ((int32_t)(word << 8) ) >> 24;
         b = ((int32_t)(word << 16) ) >> 24;
         a = ((int32_t)(word << 24) ) >> 24;
#else
         r = ((int32_t)(word << 24) ) >> 24;
         g = ((int32_t)(word << 16) ) >> 24;
         b = ((int32_t)(word << 8) ) >> 24;
         a = ((int32_t)(word) ) >> 24;
#endif
         out[0] = (uint8_t)(((uint32_t)MAX2(r, 0)) * 0xff / 0x7f); /* r */
         out[1] = (uint8_t)(((uint32_t)MAX2(g, 0)) * 0xff / 0x7f); /* g */
         out[2] = (uint8_t)(((uint32_t)MAX2(b, 0)) * 0xff / 0x7f); /* b */
         out[3] = (uint8_t)(((uint32_t)MAX2(a, 0)) * 0xff / 0x7f); /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA8 unorm pixels into R8G8B8A8_SNORM.
 *
 * Every channel is shifted right by one bit, mapping 0..255 onto 0..127, and
 * the four results are combined into one 32-bit word stored per pixel.
 *
 * \param dst_row     first destination row, four bytes per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, four bytes per pixel
 * \param src_stride  byte distance between source rows
 * \param width       number of pixels per row
 * \param height      number of rows
 */
static inline void
util_format_r8g8b8a8_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;
      for (col = 0; col < width; ++col, in += 4, out += 4) {
         /* A halved byte is at most 127, so it is already a non-negative
          * 8-bit signed value and needs no masking. */
         const uint32_t r = (uint32_t)(in[0] >> 1);
         const uint32_t g = (uint32_t)(in[1] >> 1);
         const uint32_t b = (uint32_t)(in[2] >> 1);
         const uint32_t a = (uint32_t)(in[3] >> 1);
         uint32_t packed;
#if UTIL_ARCH_BIG_ENDIAN
         packed = (r << 24) | (g << 16) | (b << 8) | a;
#else
         packed = r | (g << 8) | (b << 16) | (a << 24);
#endif
         *(uint32_t *)out = packed;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a width x height rectangle of B8G8R8A8_SNORM pixels into RGBA floats.
 *
 * Each pixel is loaded as one 32-bit word; every 8-bit channel is
 * sign-extended, scaled by 1/127 and clamped at -1.0f, so the raw value -128
 * yields exactly -1.0 instead of a value slightly below the normalized range.
 * Output is always written in r, g, b, a order.
 *
 * \param dst_row     first destination row, four floats per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, four bytes per pixel
 * \param src_stride  byte distance between source rows
 * \param width       number of pixels per row
 * \param height      number of rows
 */
static inline void
util_format_b8g8r8a8_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t value = *(const uint32_t *)src;
         int32_t b, g, r, a;
         float rf, gf, bf, af;
         /* Shift the wanted byte to the top, then shift back down as a
          * signed value to sign-extend it.  Only the byte positions depend
          * on endianness. */
#if UTIL_ARCH_BIG_ENDIAN
         b = ((int32_t)(value) ) >> 24;
         g = ((int32_t)(value << 8) ) >> 24;
         r = ((int32_t)(value << 16) ) >> 24;
         a = ((int32_t)(value << 24) ) >> 24;
#else
         b = ((int32_t)(value << 24) ) >> 24;
         g = ((int32_t)(value << 16) ) >> 24;
         r = ((int32_t)(value << 8) ) >> 24;
         a = ((int32_t)(value) ) >> 24;
#endif
         rf = (float)(r * (1.0f/0x7f));
         gf = (float)(g * (1.0f/0x7f));
         bf = (float)(b * (1.0f/0x7f));
         af = (float)(a * (1.0f/0x7f));
         /* Only -128 can fall below -1.0; pin it to the range boundary. */
         dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
         dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
         dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
         dst[3] = af < -1.0f ? -1.0f : af; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      /* dst_stride is in bytes while dst_row steps in floats. */
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA floats into B8G8R8A8_SNORM pixels.
 *
 * Every channel is clamped to [-1, 1], scaled by 127 and rounded with
 * util_iround(); the four resulting bytes are combined into one 32-bit word
 * that is stored per pixel.
 *
 * \param dst_row     first destination row, four bytes per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, four floats per pixel
 * \param src_stride  byte distance between source rows
 * \param width       number of pixels per row
 * \param height      number of rows
 */
static inline void
util_format_b8g8r8a8_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;
   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;
      for (col = 0; col < width; ++col, in += 4, out += 4) {
         /* Quantize once; endianness only decides where each byte lands. */
         const int8_t b = (int8_t)util_iround(CLAMP(in[2], -1.0f, 1.0f) * 0x7f);
         const int8_t g = (int8_t)util_iround(CLAMP(in[1], -1.0f, 1.0f) * 0x7f);
         const int8_t r = (int8_t)util_iround(CLAMP(in[0], -1.0f, 1.0f) * 0x7f);
         const int8_t a = (int8_t)util_iround(CLAMP(in[3], -1.0f, 1.0f) * 0x7f);
         uint32_t packed;
#if UTIL_ARCH_BIG_ENDIAN
         packed  = (uint32_t)b << 24;
         packed |= (uint32_t)(g & 0xff) << 16;
         packed |= (uint32_t)(r & 0xff) << 8;
         packed |= (uint32_t)(a & 0xff);
#else
         packed  = (uint32_t)(b & 0xff);
         packed |= (uint32_t)(g & 0xff) << 8;
         packed |= (uint32_t)(r & 0xff) << 16;
         packed |= (uint32_t)a << 24;
#endif
         *(uint32_t *)out = packed;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes while src_row steps in floats. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B8G8R8A8_SNORM pixel as RGBA floats.
 *
 * The pixel is loaded as one 32-bit word; every 8-bit channel is
 * sign-extended, scaled by 1/127 and clamped at -1.0f so the raw value -128
 * yields exactly -1.0.  The i and j arguments are not used.
 *
 * \param dst  receives four floats (r, g, b, a)
 * \param src  points at the four-byte pixel
 */
static inline void
util_format_b8g8r8a8_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint32_t value = *(const uint32_t *)src;
   int32_t b, g, r, a;
   float rf, gf, bf, af;
   /* Shift the wanted byte to the top, then shift back down as a signed
    * value to sign-extend it.  Only the byte positions depend on
    * endianness. */
#if UTIL_ARCH_BIG_ENDIAN
   b = ((int32_t)(value) ) >> 24;
   g = ((int32_t)(value << 8) ) >> 24;
   r = ((int32_t)(value << 16) ) >> 24;
   a = ((int32_t)(value << 24) ) >> 24;
#else
   b = ((int32_t)(value << 24) ) >> 24;
   g = ((int32_t)(value << 16) ) >> 24;
   r = ((int32_t)(value << 8) ) >> 24;
   a = ((int32_t)(value) ) >> 24;
#endif
   rf = (float)(r * (1.0f/0x7f));
   gf = (float)(g * (1.0f/0x7f));
   bf = (float)(b * (1.0f/0x7f));
   af = (float)(a * (1.0f/0x7f));
   /* Only -128 can fall below -1.0; pin it to the range boundary. */
   dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
   dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
   dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
   dst[3] = af < -1.0f ? -1.0f : af; /* a */
}

/**
 * Unpack a width x height rectangle of B8G8R8A8_SNORM pixels into RGBA8 unorm.
 *
 * Each channel is sign-extended from the 32-bit pixel word; negative values
 * are raised to 0 and the remaining range is rescaled with "* 0xff / 0x7f".
 * Output is always written in r, g, b, a order.
 *
 * \param dst_row     first destination row, four bytes per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, four bytes per pixel
 * \param src_stride  byte distance between source rows
 * \param width       number of pixels per row
 * \param height      number of rows
 */
static inline void
util_format_b8g8r8a8_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;
      for (col = 0; col < width; ++col, in += 4, out += 4) {
         const uint32_t word = *(const uint32_t *)in;
         int32_t b, g, r, a;
         /* Only the byte positions inside the word depend on endianness. */
#if UTIL_ARCH_BIG_ENDIAN
         b = ((int32_t)(word) ) >> 24;
         g = ((int32_t)(word << 8) ) >> 24;
         r = ((int32_t)(word << 16) ) >> 24;
         a = ((int32_t)(word << 24) ) >> 24;
#else
         b = ((int32_t)(word << 24) ) >> 24;
         g = ((int32_t)(word << 16) ) >> 24;
         r = ((int32_t)(word << 8) ) >> 24;
         a = ((int32_t)(word) ) >> 24;
#endif
         out[0] = (uint8_t)(((uint32_t)MAX2(r, 0)) * 0xff / 0x7f); /* r */
         out[1] = (uint8_t)(((uint32_t)MAX2(g, 0)) * 0xff / 0x7f); /* g */
         out[2] = (uint8_t)(((uint32_t)MAX2(b, 0)) * 0xff / 0x7f); /* b */
         out[3] = (uint8_t)(((uint32_t)MAX2(a, 0)) * 0xff / 0x7f); /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA8 unorm pixels into B8G8R8A8_SNORM.
 *
 * Every channel is shifted right by one bit, mapping 0..255 onto 0..127, and
 * the four results are combined into one 32-bit word stored per pixel.
 *
 * \param dst_row     first destination row, four bytes per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, four bytes per pixel (r, g, b, a)
 * \param src_stride  byte distance between source rows
 * \param width       number of pixels per row
 * \param height      number of rows
 */
static inline void
util_format_b8g8r8a8_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;
      for (col = 0; col < width; ++col, in += 4, out += 4) {
         /* A halved byte is at most 127, so it is already a non-negative
          * 8-bit signed value and needs no masking. */
         const uint32_t b = (uint32_t)(in[2] >> 1);
         const uint32_t g = (uint32_t)(in[1] >> 1);
         const uint32_t r = (uint32_t)(in[0] >> 1);
         const uint32_t a = (uint32_t)(in[3] >> 1);
         uint32_t packed;
#if UTIL_ARCH_BIG_ENDIAN
         packed = (b << 24) | (g << 16) | (r << 8) | a;
#else
         packed = b | (g << 8) | (r << 16) | (a << 24);
#endif
         *(uint32_t *)out = packed;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a width x height rectangle of R8_SSCALED pixels into RGBA floats.
 *
 * Each source byte is reinterpreted as a signed 8-bit integer and converted
 * to float without normalization.  Green and blue are written as 0 and
 * alpha as 1.
 *
 * \param dst_row     first destination row, four floats per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, one byte per pixel
 * \param src_stride  byte distance between source rows
 * \param width       number of pixels per row
 * \param height      number of rows
 */
static inline void
util_format_r8_sscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      unsigned col;
      for (col = 0; col < width; ++col, in += 1, out += 4) {
         const int8_t r = (int8_t)*in;
         out[0] = (float)r; /* r */
         out[1] = 0; /* g */
         out[2] = 0; /* b */
         out[3] = 1; /* a */
      }
      src_row += src_stride;
      /* dst_stride is in bytes while dst_row steps in floats. */
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA floats into R8_SSCALED pixels.
 *
 * Only the red input is used: it is clamped to [-128, 127] and converted to
 * a signed 8-bit integer, which is stored as one byte per pixel.
 *
 * \param dst_row     first destination row, one byte per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, four floats per pixel
 * \param src_stride  byte distance between source rows
 * \param width       number of pixels per row
 * \param height      number of rows
 */
static inline void
util_format_r8_sscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;
   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;
      for (col = 0; col < width; ++col, in += 4, out += 1) {
         *out = (uint8_t)((int8_t)CLAMP(in[0], -128.0f, 127.0f));
      }
      dst_row += dst_stride;
      /* src_stride is in bytes while src_row steps in floats. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8_SSCALED pixel as RGBA floats.
 *
 * The byte at src is reinterpreted as a signed 8-bit integer and converted
 * to float.  Green and blue are written as 0 and alpha as 1.  The i and j
 * arguments are not used.
 *
 * \param dst  receives four floats (r, g, b, a)
 * \param src  points at the one-byte pixel
 */
static inline void
util_format_r8_sscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const int8_t r = (int8_t)src[0];
   dst[0] = (float)r; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a width x height rectangle of R8_SSCALED pixels into RGBA8 unorm.
 *
 * The signed red value is clamped to [0, 1] and multiplied by 255, so the
 * output red byte is either 0 or 255.  Green and blue are written as 0 and
 * alpha as 255.
 *
 * \param dst_row     first destination row, four bytes per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, one byte per pixel
 * \param src_stride  byte distance between source rows
 * \param width       number of pixels per row
 * \param height      number of rows
 */
static inline void
util_format_r8_sscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;
      for (col = 0; col < width; ++col, in += 1, out += 4) {
         const int8_t r = (int8_t)*in;
         out[0] = (uint8_t)(((uint32_t)CLAMP(r, 0, 1)) * 0xff / 0x1); /* r */
         out[1] = 0; /* g */
         out[2] = 0; /* b */
         out[3] = 255; /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA8 unorm pixels into R8_SSCALED.
 *
 * Only the red byte is used.  It is integer-divided by 255, so 255 becomes 1
 * and every other value becomes 0.  One byte is written per pixel.
 *
 * \param dst_row     first destination row, one byte per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, four bytes per pixel
 * \param src_stride  byte distance between source rows
 * \param width       number of pixels per row
 * \param height      number of rows
 */
static inline void
util_format_r8_sscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;
      for (col = 0; col < width; ++col, in += 4, out += 1) {
         /* The quotient is only ever 0 or 1. */
         *out = (uint8_t)((uint32_t)in[0] / 0xff);
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a width x height rectangle of R8G8_SSCALED pixels into RGBA floats.
 *
 * Each pixel is loaded as one 16-bit word; both 8-bit channels are
 * sign-extended and converted to float without normalization.  Blue is
 * written as 0 and alpha as 1.
 *
 * \param dst_row     first destination row, four floats per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, two bytes per pixel
 * \param src_stride  byte distance between source rows
 * \param width       number of pixels per row
 * \param height      number of rows
 */
static inline void
util_format_r8g8_sscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;
      unsigned col;
      for (col = 0; col < width; ++col, in += 2, out += 4) {
         const uint16_t word = *(const uint16_t *)in;
         int16_t r, g;
         /* Only the byte positions inside the word depend on endianness. */
#if UTIL_ARCH_BIG_ENDIAN
         r = ((int16_t)(word) ) >> 8;
         g = ((int16_t)(word << 8) ) >> 8;
#else
         r = ((int16_t)(word << 8) ) >> 8;
         g = ((int16_t)(word) ) >> 8;
#endif
         out[0] = (float)r; /* r */
         out[1] = (float)g; /* g */
         out[2] = 0; /* b */
         out[3] = 1; /* a */
      }
      src_row += src_stride;
      /* dst_stride is in bytes while dst_row steps in floats. */
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA floats into R8G8_SSCALED pixels.
 *
 * The red and green inputs are clamped to [-128, 127] and converted to
 * signed 8-bit integers; blue and alpha are ignored.  The two bytes are
 * combined into one 16-bit word stored per pixel.
 *
 * \param dst_row     first destination row, two bytes per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, four floats per pixel
 * \param src_stride  byte distance between source rows
 * \param width       number of pixels per row
 * \param height      number of rows
 */
static inline void
util_format_r8g8_sscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;
   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;
      for (col = 0; col < width; ++col, in += 4, out += 2) {
         /* Convert once; endianness only decides where each byte lands. */
         const int8_t r = (int8_t)CLAMP(in[0], -128.0f, 127.0f);
         const int8_t g = (int8_t)CLAMP(in[1], -128.0f, 127.0f);
         uint16_t packed;
#if UTIL_ARCH_BIG_ENDIAN
         packed = (uint16_t)((uint16_t)((uint32_t)r << 8) | (uint16_t)(g & 0xff));
#else
         packed = (uint16_t)((uint16_t)(r & 0xff) | (uint16_t)((uint32_t)g << 8));
#endif
         *(uint16_t *)out = packed;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes while src_row steps in floats. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8_SSCALED pixel as RGBA floats.
 *
 * The pixel is loaded as one 16-bit word; both 8-bit channels are
 * sign-extended and converted to float.  Blue is written as 0 and alpha
 * as 1.  The i and j arguments are not used.
 *
 * \param dst  receives four floats (r, g, b, a)
 * \param src  points at the two-byte pixel
 */
static inline void
util_format_r8g8_sscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint16_t word = *(const uint16_t *)src;
   int16_t r, g;
   /* Only the byte positions inside the word depend on endianness. */
#if UTIL_ARCH_BIG_ENDIAN
   r = ((int16_t)(word) ) >> 8;
   g = ((int16_t)(word << 8) ) >> 8;
#else
   r = ((int16_t)(word << 8) ) >> 8;
   g = ((int16_t)(word) ) >> 8;
#endif
   dst[0] = (float)r; /* r */
   dst[1] = (float)g; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a width x height rectangle of R8G8_SSCALED pixels into RGBA8 unorm.
 *
 * Each signed channel is clamped to [0, 1] and multiplied by 255, so the
 * red and green output bytes are either 0 or 255.  Blue is written as 0 and
 * alpha as 255.
 *
 * \param dst_row     first destination row, four bytes per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, two bytes per pixel
 * \param src_stride  byte distance between source rows
 * \param width       number of pixels per row
 * \param height      number of rows
 */
static inline void
util_format_r8g8_sscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;
      for (col = 0; col < width; ++col, in += 2, out += 4) {
         const uint16_t word = *(const uint16_t *)in;
         int16_t r, g;
         /* Only the byte positions inside the word depend on endianness. */
#if UTIL_ARCH_BIG_ENDIAN
         r = ((int16_t)(word) ) >> 8;
         g = ((int16_t)(word << 8) ) >> 8;
#else
         r = ((int16_t)(word << 8) ) >> 8;
         g = ((int16_t)(word) ) >> 8;
#endif
         out[0] = (uint8_t)(((uint32_t)CLAMP(r, 0, 1)) * 0xff / 0x1); /* r */
         out[1] = (uint8_t)(((uint32_t)CLAMP(g, 0, 1)) * 0xff / 0x1); /* g */
         out[2] = 0; /* b */
         out[3] = 255; /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA8 unorm pixels into R8G8_SSCALED.
 *
 * The red and green bytes are each integer-divided by 255, so 255 becomes 1
 * and every other value becomes 0; blue and alpha are ignored.  The two
 * results are combined into one 16-bit word stored per pixel.
 *
 * \param dst_row     first destination row, two bytes per pixel
 * \param dst_stride  byte distance between destination rows
 * \param src_row     first source row, four bytes per pixel
 * \param src_stride  byte distance between source rows
 * \param width       number of pixels per row
 * \param height      number of rows
 */
static inline void
util_format_r8g8_sscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;
      for (col = 0; col < width; ++col, in += 4, out += 2) {
         /* Each quotient is only ever 0 or 1, so no masking is needed. */
         const uint32_t r = (uint32_t)in[0] / 0xff;
         const uint32_t g = (uint32_t)in[1] / 0xff;
         uint16_t packed;
#if UTIL_ARCH_BIG_ENDIAN
         packed = (uint16_t)((r << 8) | g);
#else
         packed = (uint16_t)(r | (g << 8));
#endif
         *(uint16_t *)out = packed;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one R8G8B8_SSCALED pixel: three signed bytes stored in
 * r, g, b order.  The member order is the same for big- and little-endian
 * builds, so a single member list covers both.
 */
struct util_format_r8g8b8_sscaled {
   int8_t r;
   int8_t g;
   int8_t b;
};

static inline void
util_format_r8g8b8_sscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r8g8b8_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r8g8b8_sscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_sscaled pixel;
         pixel.r = (int8_t)CLAMP(src[0], -128.0f, 127.0f);
         pixel.g = (int8_t)CLAMP(src[1], -128.0f, 127.0f);
         pixel.b = (int8_t)CLAMP(src[2], -128.0f, 127.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_sscaled pixel;
         pixel.r = (int8_t)CLAMP(src[0], -128.0f, 127.0f);
         pixel.g = (int8_t)CLAMP(src[1], -128.0f, 127.0f);
         pixel.b = (int8_t)CLAMP(src[2], -128.0f, 127.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8B8_SSCALED pixel as RGBA floats.
 *
 * Reads three signed bytes (r, g, b) from src and converts them to float
 * without normalization.  Alpha is written as 1.  The i and j arguments are
 * not used.
 *
 * \param dst  receives four floats (r, g, b, a)
 * \param src  points at the three-byte pixel
 */
static inline void
util_format_r8g8b8_sscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Same member order on either endianness: one path is enough. */
   struct util_format_r8g8b8_sscaled texel;
   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)texel.r; /* r */
   dst[1] = (float)texel.g; /* g */
   dst[2] = (float)texel.b; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r8g8b8_sscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r8g8b8_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r8g8b8_sscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_sscaled pixel;
         pixel.r = (int8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_sscaled pixel;
         pixel.r = (int8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         pixel.g = (int8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.b = (int8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one B8G8R8_SSCALED pixel: three signed bytes stored in
 * b, g, r order.  The member order is the same for big- and little-endian
 * builds, so a single member list covers both.
 */
struct util_format_b8g8r8_sscaled {
   int8_t b;
   int8_t g;
   int8_t r;
};

static inline void
util_format_b8g8r8_sscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_b8g8r8_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)pixel.r; /* r */
         dst[1] = (float)pixel.g; /* g */
         dst[2] = (float)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_b8g8r8_sscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_sscaled pixel;
         pixel.b = (int8_t)CLAMP(src[2], -128.0f, 127.0f);
         pixel.g = (int8_t)CLAMP(src[1], -128.0f, 127.0f);
         pixel.r = (int8_t)CLAMP(src[0], -128.0f, 127.0f);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_sscaled pixel;
         pixel.b = (int8_t)CLAMP(src[2], -128.0f, 127.0f);
         pixel.g = (int8_t)CLAMP(src[1], -128.0f, 127.0f);
         pixel.r = (int8_t)CLAMP(src[0], -128.0f, 127.0f);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single b8g8r8_sscaled texel as RGBA floats.
 *
 * The texel at src is copied into a util_format_b8g8r8_sscaled struct and
 * each signed channel is cast to float; alpha is written as 1.  The i and
 * j arguments are not used.
 *
 * \param dst  receives four floats in r, g, b, a order
 * \param src  points at the texel to read
 */
static inline void
util_format_b8g8r8_sscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The big- and little-endian paths are identical for this format. */
   struct util_format_b8g8r8_sscaled texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)texel.r;
   dst[1] = (float)texel.g;
   dst[2] = (float)texel.b;
   dst[3] = 1;
}

static inline void
util_format_b8g8r8_sscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_b8g8r8_sscaled pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)(((uint32_t)CLAMP(pixel.r, 0, 1)) * 0xff / 0x1); /* r */
         dst[1] = (uint8_t)(((uint32_t)CLAMP(pixel.g, 0, 1)) * 0xff / 0x1); /* g */
         dst[2] = (uint8_t)(((uint32_t)CLAMP(pixel.b, 0, 1)) * 0xff / 0x1); /* b */
         dst[3] = 255; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_b8g8r8_sscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_sscaled pixel;
         pixel.b = (int8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         pixel.g = (int8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.r = (int8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_sscaled pixel;
         pixel.b = (int8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         pixel.g = (int8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         pixel.r = (int8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack rows of r8g8b8a8_sscaled texels to RGBA floats.
 *
 * A texel is four signed bytes laid out in memory as r, g, b, a.  Each
 * channel is converted to float as-is (no normalization).
 *
 * The channels are read one byte at a time instead of dereferencing src
 * as a uint32_t.  That removes the alignment requirement on src, avoids
 * the type-punned load, and gives the same result for both byte orders,
 * so no UTIL_ARCH_BIG_ENDIAN split is needed.
 *
 * \param dst_row     first destination row, four floats per pixel
 * \param dst_stride  destination row pitch in bytes
 * \param src_row     first source row, 4 bytes per pixel
 * \param src_stride  source row pitch in bytes
 * \param width       number of pixels in each row
 * \param height      number of rows
 */
static inline void
util_format_r8g8b8a8_sscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         /* Load the whole texel before storing anything. */
         const int8_t r = (int8_t)src[0];
         const int8_t g = (int8_t)src[1];
         const int8_t b = (int8_t)src[2];
         const int8_t a = (int8_t)src[3];
         dst[0] = (float)r; /* r */
         dst[1] = (float)g; /* g */
         dst[2] = (float)b; /* b */
         dst[3] = (float)a; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA floats into r8g8b8a8_sscaled texels.
 *
 * Each input channel is clamped to [-128, 127] and cast to a signed byte.
 * The four bytes are stored in memory as r, g, b, a.
 *
 * The texel is written one byte at a time instead of through a
 * uint32_t pointer cast.  That removes the alignment requirement on dst,
 * avoids the type-punned store, and produces the same bytes for both
 * byte orders, so no UTIL_ARCH_BIG_ENDIAN split is needed.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  destination row pitch in bytes
 * \param src_row     first source row, four floats per pixel
 * \param src_stride  source row pitch in bytes
 * \param width       number of pixels in each row
 * \param height      number of rows
 */
static inline void
util_format_r8g8b8a8_sscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         /* Convert every channel first, then store the four bytes. */
         const int8_t r = (int8_t)CLAMP(src[0], -128.0f, 127.0f);
         const int8_t g = (int8_t)CLAMP(src[1], -128.0f, 127.0f);
         const int8_t b = (int8_t)CLAMP(src[2], -128.0f, 127.0f);
         const int8_t a = (int8_t)CLAMP(src[3], -128.0f, 127.0f);
         dst[0] = (uint8_t)r;
         dst[1] = (uint8_t)g;
         dst[2] = (uint8_t)b;
         dst[3] = (uint8_t)a;
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single r8g8b8a8_sscaled texel as RGBA floats.
 *
 * The texel is four signed bytes laid out in memory as r, g, b, a; each
 * is converted to float without normalization.  The i and j arguments
 * are not used.
 *
 * The bytes are read individually instead of dereferencing src as a
 * uint32_t, so src needs no particular alignment and the result is the
 * same for both byte orders.
 *
 * \param dst  receives four floats in r, g, b, a order
 * \param src  points at the texel to read
 */
static inline void
util_format_r8g8b8a8_sscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Load the whole texel before storing anything. */
   const int8_t r = (int8_t)src[0];
   const int8_t g = (int8_t)src[1];
   const int8_t b = (int8_t)src[2];
   const int8_t a = (int8_t)src[3];
   dst[0] = (float)r; /* r */
   dst[1] = (float)g; /* g */
   dst[2] = (float)b; /* b */
   dst[3] = (float)a; /* a */
}

/**
 * Unpack rows of r8g8b8a8_sscaled texels to RGBA8 unorm pixels.
 *
 * A texel is four signed bytes laid out in memory as r, g, b, a.  Each
 * channel is clamped to [0, 1] and scaled by 0xff, which for an integer
 * input means: values greater than zero give 0xff, everything else
 * gives 0.
 *
 * The bytes are read individually instead of dereferencing src as a
 * uint32_t, so src needs no particular alignment and the result is the
 * same for both byte orders.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  destination row pitch in bytes
 * \param src_row     first source row, 4 bytes per pixel
 * \param src_stride  source row pitch in bytes
 * \param width       number of pixels in each row
 * \param height      number of rows
 */
static inline void
util_format_r8g8b8a8_sscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         /* Load the whole texel before storing anything. */
         const int8_t r = (int8_t)src[0];
         const int8_t g = (int8_t)src[1];
         const int8_t b = (int8_t)src[2];
         const int8_t a = (int8_t)src[3];
         /* Integer clamp to [0, 1] times 0xff: positive -> 0xff, else 0. */
         dst[0] = r > 0 ? 0xff : 0; /* r */
         dst[1] = g > 0 ? 0xff : 0; /* g */
         dst[2] = b > 0 ? 0xff : 0; /* b */
         dst[3] = a > 0 ? 0xff : 0; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA8 unorm pixels into r8g8b8a8_sscaled texels.
 *
 * Each source channel is reduced with the integer expression
 * (value * 1 / 0xff), so a source byte of 0xff becomes 1 and every other
 * value becomes 0.  The four result bytes are stored as r, g, b, a.
 *
 * The texel is written one byte at a time instead of through a
 * uint32_t pointer cast, so dst needs no particular alignment and the
 * stored bytes are the same for both byte orders.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  destination row pitch in bytes
 * \param src_row     first source row, 4 bytes per pixel
 * \param src_stride  source row pitch in bytes
 * \param width       number of pixels in each row
 * \param height      number of rows
 */
static inline void
util_format_r8g8b8a8_sscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         /* Convert every channel first, then store the four bytes. */
         const uint8_t r = (uint8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         const uint8_t g = (uint8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         const uint8_t b = (uint8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         const uint8_t a = (uint8_t)(((uint32_t)src[3]) * 0x1 / 0xff);
         dst[0] = r;
         dst[1] = g;
         dst[2] = b;
         dst[3] = a;
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack rows of b8g8r8a8_sscaled texels to RGBA floats.
 *
 * A texel is four signed bytes laid out in memory as b, g, r, a.  Each
 * channel is converted to float as-is (no normalization) and written in
 * r, g, b, a order.
 *
 * The channels are read one byte at a time instead of dereferencing src
 * as a uint32_t.  That removes the alignment requirement on src, avoids
 * the type-punned load, and gives the same result for both byte orders,
 * so no UTIL_ARCH_BIG_ENDIAN split is needed.
 *
 * \param dst_row     first destination row, four floats per pixel
 * \param dst_stride  destination row pitch in bytes
 * \param src_row     first source row, 4 bytes per pixel
 * \param src_stride  source row pitch in bytes
 * \param width       number of pixels in each row
 * \param height      number of rows
 */
static inline void
util_format_b8g8r8a8_sscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         /* Load the whole texel before storing anything. */
         const int8_t b = (int8_t)src[0];
         const int8_t g = (int8_t)src[1];
         const int8_t r = (int8_t)src[2];
         const int8_t a = (int8_t)src[3];
         dst[0] = (float)r; /* r */
         dst[1] = (float)g; /* g */
         dst[2] = (float)b; /* b */
         dst[3] = (float)a; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA floats into b8g8r8a8_sscaled texels.
 *
 * Each input channel is clamped to [-128, 127] and cast to a signed byte.
 * The four bytes are stored in memory as b, g, r, a.
 *
 * The texel is written one byte at a time instead of through a
 * uint32_t pointer cast.  That removes the alignment requirement on dst,
 * avoids the type-punned store, and produces the same bytes for both
 * byte orders, so no UTIL_ARCH_BIG_ENDIAN split is needed.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  destination row pitch in bytes
 * \param src_row     first source row, four floats per pixel
 * \param src_stride  source row pitch in bytes
 * \param width       number of pixels in each row
 * \param height      number of rows
 */
static inline void
util_format_b8g8r8a8_sscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         /* Convert every channel first, then store the four bytes. */
         const int8_t b = (int8_t)CLAMP(src[2], -128.0f, 127.0f);
         const int8_t g = (int8_t)CLAMP(src[1], -128.0f, 127.0f);
         const int8_t r = (int8_t)CLAMP(src[0], -128.0f, 127.0f);
         const int8_t a = (int8_t)CLAMP(src[3], -128.0f, 127.0f);
         dst[0] = (uint8_t)b;
         dst[1] = (uint8_t)g;
         dst[2] = (uint8_t)r;
         dst[3] = (uint8_t)a;
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single b8g8r8a8_sscaled texel as RGBA floats.
 *
 * The texel is four signed bytes laid out in memory as b, g, r, a; each
 * is converted to float without normalization and written in r, g, b, a
 * order.  The i and j arguments are not used.
 *
 * The bytes are read individually instead of dereferencing src as a
 * uint32_t, so src needs no particular alignment and the result is the
 * same for both byte orders.
 *
 * \param dst  receives four floats in r, g, b, a order
 * \param src  points at the texel to read
 */
static inline void
util_format_b8g8r8a8_sscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Load the whole texel before storing anything. */
   const int8_t b = (int8_t)src[0];
   const int8_t g = (int8_t)src[1];
   const int8_t r = (int8_t)src[2];
   const int8_t a = (int8_t)src[3];
   dst[0] = (float)r; /* r */
   dst[1] = (float)g; /* g */
   dst[2] = (float)b; /* b */
   dst[3] = (float)a; /* a */
}

/**
 * Unpack rows of b8g8r8a8_sscaled texels to RGBA8 unorm pixels.
 *
 * A texel is four signed bytes laid out in memory as b, g, r, a.  Each
 * channel is clamped to [0, 1] and scaled by 0xff, which for an integer
 * input means: values greater than zero give 0xff, everything else
 * gives 0.  Output is written in r, g, b, a order.
 *
 * The bytes are read individually instead of dereferencing src as a
 * uint32_t, so src needs no particular alignment and the result is the
 * same for both byte orders.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  destination row pitch in bytes
 * \param src_row     first source row, 4 bytes per pixel
 * \param src_stride  source row pitch in bytes
 * \param width       number of pixels in each row
 * \param height      number of rows
 */
static inline void
util_format_b8g8r8a8_sscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         /* Load the whole texel before storing anything. */
         const int8_t b = (int8_t)src[0];
         const int8_t g = (int8_t)src[1];
         const int8_t r = (int8_t)src[2];
         const int8_t a = (int8_t)src[3];
         /* Integer clamp to [0, 1] times 0xff: positive -> 0xff, else 0. */
         dst[0] = r > 0 ? 0xff : 0; /* r */
         dst[1] = g > 0 ? 0xff : 0; /* g */
         dst[2] = b > 0 ? 0xff : 0; /* b */
         dst[3] = a > 0 ? 0xff : 0; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA8 unorm pixels into b8g8r8a8_sscaled texels.
 *
 * Each source channel is reduced with the integer expression
 * (value * 1 / 0xff), so a source byte of 0xff becomes 1 and every other
 * value becomes 0.  The four result bytes are stored as b, g, r, a.
 *
 * The texel is written one byte at a time instead of through a
 * uint32_t pointer cast, so dst needs no particular alignment and the
 * stored bytes are the same for both byte orders.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  destination row pitch in bytes
 * \param src_row     first source row, 4 bytes per pixel (r, g, b, a)
 * \param src_stride  source row pitch in bytes
 * \param width       number of pixels in each row
 * \param height      number of rows
 */
static inline void
util_format_b8g8r8a8_sscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         /* Convert every channel first, then store the four bytes. */
         const uint8_t b = (uint8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         const uint8_t g = (uint8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         const uint8_t r = (uint8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         const uint8_t a = (uint8_t)(((uint32_t)src[3]) * 0x1 / 0xff);
         dst[0] = b;
         dst[1] = g;
         dst[2] = r;
         dst[3] = a;
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack rows of a8b8g8r8_sscaled texels to RGBA floats.
 *
 * A texel is four signed bytes laid out in memory as a, b, g, r.  Each
 * channel is converted to float as-is (no normalization) and written in
 * r, g, b, a order.
 *
 * The channels are read one byte at a time instead of dereferencing src
 * as a uint32_t.  That removes the alignment requirement on src, avoids
 * the type-punned load, and gives the same result for both byte orders,
 * so no UTIL_ARCH_BIG_ENDIAN split is needed.
 *
 * \param dst_row     first destination row, four floats per pixel
 * \param dst_stride  destination row pitch in bytes
 * \param src_row     first source row, 4 bytes per pixel
 * \param src_stride  source row pitch in bytes
 * \param width       number of pixels in each row
 * \param height      number of rows
 */
static inline void
util_format_a8b8g8r8_sscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         /* Load the whole texel before storing anything. */
         const int8_t a = (int8_t)src[0];
         const int8_t b = (int8_t)src[1];
         const int8_t g = (int8_t)src[2];
         const int8_t r = (int8_t)src[3];
         dst[0] = (float)r; /* r */
         dst[1] = (float)g; /* g */
         dst[2] = (float)b; /* b */
         dst[3] = (float)a; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA floats into a8b8g8r8_sscaled texels.
 *
 * Each input channel is clamped to [-128, 127] and cast to a signed byte.
 * The four bytes are stored in memory as a, b, g, r.
 *
 * The texel is written one byte at a time instead of through a
 * uint32_t pointer cast.  That removes the alignment requirement on dst,
 * avoids the type-punned store, and produces the same bytes for both
 * byte orders, so no UTIL_ARCH_BIG_ENDIAN split is needed.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  destination row pitch in bytes
 * \param src_row     first source row, four floats per pixel
 * \param src_stride  source row pitch in bytes
 * \param width       number of pixels in each row
 * \param height      number of rows
 */
static inline void
util_format_a8b8g8r8_sscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         /* Convert every channel first, then store the four bytes. */
         const int8_t a = (int8_t)CLAMP(src[3], -128.0f, 127.0f);
         const int8_t b = (int8_t)CLAMP(src[2], -128.0f, 127.0f);
         const int8_t g = (int8_t)CLAMP(src[1], -128.0f, 127.0f);
         const int8_t r = (int8_t)CLAMP(src[0], -128.0f, 127.0f);
         dst[0] = (uint8_t)a;
         dst[1] = (uint8_t)b;
         dst[2] = (uint8_t)g;
         dst[3] = (uint8_t)r;
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single a8b8g8r8_sscaled texel as RGBA floats.
 *
 * The texel is four signed bytes laid out in memory as a, b, g, r; each
 * is converted to float without normalization and written in r, g, b, a
 * order.  The i and j arguments are not used.
 *
 * The bytes are read individually instead of dereferencing src as a
 * uint32_t, so src needs no particular alignment and the result is the
 * same for both byte orders.
 *
 * \param dst  receives four floats in r, g, b, a order
 * \param src  points at the texel to read
 */
static inline void
util_format_a8b8g8r8_sscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Load the whole texel before storing anything. */
   const int8_t a = (int8_t)src[0];
   const int8_t b = (int8_t)src[1];
   const int8_t g = (int8_t)src[2];
   const int8_t r = (int8_t)src[3];
   dst[0] = (float)r; /* r */
   dst[1] = (float)g; /* g */
   dst[2] = (float)b; /* b */
   dst[3] = (float)a; /* a */
}

/**
 * Unpack rows of a8b8g8r8_sscaled texels to RGBA8 unorm pixels.
 *
 * A texel is four signed bytes laid out in memory as a, b, g, r.  Each
 * channel is clamped to [0, 1] and scaled by 0xff, which for an integer
 * input means: values greater than zero give 0xff, everything else
 * gives 0.  Output is written in r, g, b, a order.
 *
 * The bytes are read individually instead of dereferencing src as a
 * uint32_t, so src needs no particular alignment and the result is the
 * same for both byte orders.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  destination row pitch in bytes
 * \param src_row     first source row, 4 bytes per pixel
 * \param src_stride  source row pitch in bytes
 * \param width       number of pixels in each row
 * \param height      number of rows
 */
static inline void
util_format_a8b8g8r8_sscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         /* Load the whole texel before storing anything. */
         const int8_t a = (int8_t)src[0];
         const int8_t b = (int8_t)src[1];
         const int8_t g = (int8_t)src[2];
         const int8_t r = (int8_t)src[3];
         /* Integer clamp to [0, 1] times 0xff: positive -> 0xff, else 0. */
         dst[0] = r > 0 ? 0xff : 0; /* r */
         dst[1] = g > 0 ? 0xff : 0; /* g */
         dst[2] = b > 0 ? 0xff : 0; /* b */
         dst[3] = a > 0 ? 0xff : 0; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA8 unorm pixels into a8b8g8r8_sscaled texels.
 *
 * Each source channel is reduced with the integer expression
 * (value * 1 / 0xff), so a source byte of 0xff becomes 1 and every other
 * value becomes 0.  The four result bytes are stored as a, b, g, r.
 *
 * The texel is written one byte at a time instead of through a
 * uint32_t pointer cast, so dst needs no particular alignment and the
 * stored bytes are the same for both byte orders.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  destination row pitch in bytes
 * \param src_row     first source row, 4 bytes per pixel (r, g, b, a)
 * \param src_stride  source row pitch in bytes
 * \param width       number of pixels in each row
 * \param height      number of rows
 */
static inline void
util_format_a8b8g8r8_sscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         /* Convert every channel first, then store the four bytes. */
         const uint8_t a = (uint8_t)(((uint32_t)src[3]) * 0x1 / 0xff);
         const uint8_t b = (uint8_t)(((uint32_t)src[2]) * 0x1 / 0xff);
         const uint8_t g = (uint8_t)(((uint32_t)src[1]) * 0x1 / 0xff);
         const uint8_t r = (uint8_t)(((uint32_t)src[0]) * 0x1 / 0xff);
         dst[0] = a;
         dst[1] = b;
         dst[2] = g;
         dst[3] = r;
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Texel layout for r32_fixed: a single signed 32-bit channel.  The
 * unpack helpers for this format scale the raw value by 1.0/0x10000.
 */
struct util_format_r32_fixed {
   int32_t r; /* raw fixed-point red value */
};

static inline void
util_format_r32_fixed_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x10000)); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA floats into r32_fixed texels.
 *
 * The red input is multiplied by 0x10000 in double precision and the
 * result is converted to int32_t; green, blue and alpha are not used.
 *
 * The scaled value is clamped to the int32_t range before the
 * conversion.  The previous clamp of the input to [-65536, 65535] still
 * allowed products outside that range (for example 65535 * 0x10000),
 * and converting such a double to int32_t is undefined behaviour.
 * Inputs whose scaled value already fits in int32_t give the same
 * result as before.
 *
 * \param dst_row     first destination row, 4 bytes per pixel
 * \param dst_stride  destination row pitch in bytes
 * \param src_row     first source row, four floats per pixel
 * \param src_stride  source row pitch in bytes
 * \param width       number of pixels in each row
 * \param height      number of rows
 */
static inline void
util_format_r32_fixed_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r32_fixed pixel;
         /* Scale in double, then saturate so the cast is always in range. */
         const double r = (double)src[0] * (double)0x10000;
         pixel.r = (int32_t)CLAMP(r, -2147483648.0, 2147483647.0);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single r32_fixed texel as RGBA floats.
 *
 * The raw 32-bit value is multiplied by 1.0/0x10000 and stored as red;
 * green and blue are written as 0 and alpha as 1.  The i and j arguments
 * are not used.
 *
 * \param dst  receives four floats in r, g, b, a order
 * \param src  points at the texel to read
 */
static inline void
util_format_r32_fixed_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r32_fixed texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)(texel.r * (1.0/0x10000));
   dst[1] = 0;
   dst[2] = 0;
   dst[3] = 1;
}

static inline void
util_format_r32_fixed_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround((CLAMP(pixel.r, 0, 65536) * (1.0/0x10000)) * 0xff); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32_fixed_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r32_fixed pixel;
         pixel.r = (int32_t)((float)(src[0] * (1.0f/0xff)) * (double)0x10000);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Texel layout for r32g32_fixed: two signed 32-bit channels declared in
 * r, g order.  The member order is the same whether or not
 * UTIL_ARCH_BIG_ENDIAN is set, so no conditional is needed.
 */
struct util_format_r32g32_fixed {
   int32_t r;
   int32_t g;
};

static inline void
util_format_r32g32_fixed_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x10000)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x10000)); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x10000)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x10000)); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of RGBA floats into r32g32_fixed texels.
 *
 * The red and green inputs are each multiplied by 0x10000 in double
 * precision and converted to int32_t; blue and alpha are not used.
 *
 * The scaled values are clamped to the int32_t range before the
 * conversion.  The previous clamp of the inputs to [-65536, 65535]
 * still allowed products outside that range (for example
 * 65535 * 0x10000), and converting such a double to int32_t is
 * undefined behaviour.  Inputs whose scaled value already fits in
 * int32_t give the same result as before.  Both byte orders share this
 * code because the original endian branches were identical.
 *
 * \param dst_row     first destination row, 8 bytes per pixel
 * \param dst_stride  destination row pitch in bytes
 * \param src_row     first source row, four floats per pixel
 * \param src_stride  source row pitch in bytes
 * \param width       number of pixels in each row
 * \param height      number of rows
 */
static inline void
util_format_r32g32_fixed_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         struct util_format_r32g32_fixed pixel;
         /* Scale in double, then saturate so the casts are always in range. */
         const double r = (double)src[0] * (double)0x10000;
         const double g = (double)src[1] * (double)0x10000;
         pixel.r = (int32_t)CLAMP(r, -2147483648.0, 2147483647.0);
         pixel.g = (int32_t)CLAMP(g, -2147483648.0, 2147483647.0);
         memcpy(dst, &pixel, sizeof pixel);
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single r32g32_fixed texel as RGBA floats.
 *
 * The raw red and green values are each multiplied by 1.0/0x10000; blue
 * is written as 0 and alpha as 1.  The i and j arguments are not used.
 *
 * \param dst  receives four floats in r, g, b, a order
 * \param src  points at the texel to read
 */
static inline void
util_format_r32g32_fixed_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The big- and little-endian paths are identical for this format. */
   struct util_format_r32g32_fixed texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)(texel.r * (1.0/0x10000));
   dst[1] = (float)(texel.g * (1.0/0x10000));
   dst[2] = 0;
   dst[3] = 1;
}

static inline void
util_format_r32g32_fixed_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround((CLAMP(pixel.r, 0, 65536) * (1.0/0x10000)) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround((CLAMP(pixel.g, 0, 65536) * (1.0/0x10000)) * 0xff); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround((CLAMP(pixel.r, 0, 65536) * (1.0/0x10000)) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround((CLAMP(pixel.g, 0, 65536) * (1.0/0x10000)) * 0xff); /* g */
         dst[2] = 0; /* b */
         dst[3] = 255; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32_fixed_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_fixed pixel;
         pixel.r = (int32_t)((float)(src[0] * (1.0f/0xff)) * (double)0x10000);
         pixel.g = (int32_t)((float)(src[1] * (1.0f/0xff)) * (double)0x10000);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_fixed pixel;
         pixel.r = (int32_t)((float)(src[0] * (1.0f/0xff)) * (double)0x10000);
         pixel.g = (int32_t)((float)(src[1] * (1.0f/0xff)) * (double)0x10000);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R32G32B32_FIXED pixel: three 32-bit signed
 * channels stored in r, g, b order.  The field order is the same for
 * big- and little-endian hosts.
 */
struct util_format_r32g32b32_fixed {
   int32_t r;
   int32_t g;
   int32_t b;
};

static inline void
util_format_r32g32b32_fixed_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x10000)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x10000)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0x10000)); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32b32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x10000)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x10000)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0x10000)); /* b */
         dst[3] = 1; /* a */
#endif
         src += 12;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Convert a width x height rectangle of RGBA float pixels to R32G32B32_FIXED.
 *
 * Each of r, g and b is multiplied by 0x10000 and truncated to a 32-bit
 * signed integer (16 fractional bits); source alpha is ignored.
 * src_stride and dst_stride are the byte distances between consecutive rows.
 *
 * The scaled value is saturated to [INT32_MIN, INT32_MAX] before the
 * conversion.  Clamping the input to [-65536, 65535] is not sufficient:
 * 65535 * 0x10000 does not fit in int32_t, and converting an out-of-range
 * floating-point value to an integer is undefined behaviour.  Inputs that
 * were already representable produce the same result as before.
 */
static inline void
util_format_r32g32b32_fixed_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const double fixed_min = (double)INT32_MIN;
   const double fixed_max = (double)INT32_MAX;
   unsigned x, y;

   for (y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;

      for (x = 0; x < width; x += 1) {
         /* The encode is the same for either byte order. */
         struct util_format_r32g32b32_fixed pixel;
         /* float * 2^16 is exact in double, so scaling first loses nothing. */
         const double r = (double)src[0] * (double)0x10000;
         const double g = (double)src[1] * (double)0x10000;
         const double b = (double)src[2] * (double)0x10000;

         pixel.r = (int32_t)CLAMP(r, fixed_min, fixed_max);
         pixel.g = (int32_t)CLAMP(g, fixed_min, fixed_max);
         pixel.b = (int32_t)CLAMP(b, fixed_min, fixed_max);
         memcpy(dst, &pixel, sizeof pixel);

         src += 4;
         dst += 12;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R32G32B32_FIXED texel from src and write it to dst as RGBA
 * floats.
 *
 * Red, green and blue are scaled by 1/0x10000 (16 fractional bits); alpha
 * is set to 1.  The texel coordinates i and j are not used.
 */
static inline void
util_format_r32g32b32_fixed_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The decode is the same for either byte order. */
   const double fixed_to_float = 1.0/0x10000;
   struct util_format_r32g32b32_fixed texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)(texel.r * fixed_to_float); /* r */
   dst[1] = (float)(texel.g * fixed_to_float); /* g */
   dst[2] = (float)(texel.b * fixed_to_float); /* b */
   dst[3] = 1.0f; /* a */
}

static inline void
util_format_r32g32b32_fixed_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround((CLAMP(pixel.r, 0, 65536) * (1.0/0x10000)) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround((CLAMP(pixel.g, 0, 65536) * (1.0/0x10000)) * 0xff); /* g */
         dst[2] = (uint8_t)util_iround((CLAMP(pixel.b, 0, 65536) * (1.0/0x10000)) * 0xff); /* b */
         dst[3] = 255; /* a */
#else
         struct util_format_r32g32b32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround((CLAMP(pixel.r, 0, 65536) * (1.0/0x10000)) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround((CLAMP(pixel.g, 0, 65536) * (1.0/0x10000)) * 0xff); /* g */
         dst[2] = (uint8_t)util_iround((CLAMP(pixel.b, 0, 65536) * (1.0/0x10000)) * 0xff); /* b */
         dst[3] = 255; /* a */
#endif
         src += 12;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32_fixed_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_fixed pixel;
         pixel.r = (int32_t)((float)(src[0] * (1.0f/0xff)) * (double)0x10000);
         pixel.g = (int32_t)((float)(src[1] * (1.0f/0xff)) * (double)0x10000);
         pixel.b = (int32_t)((float)(src[2] * (1.0f/0xff)) * (double)0x10000);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_fixed pixel;
         pixel.r = (int32_t)((float)(src[0] * (1.0f/0xff)) * (double)0x10000);
         pixel.g = (int32_t)((float)(src[1] * (1.0f/0xff)) * (double)0x10000);
         pixel.b = (int32_t)((float)(src[2] * (1.0f/0xff)) * (double)0x10000);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory layout of one R32G32B32A32_FIXED pixel: four 32-bit signed
 * channels stored in r, g, b, a order.  The field order is the same for
 * big- and little-endian hosts.
 */
struct util_format_r32g32b32a32_fixed {
   int32_t r;
   int32_t g;
   int32_t b;
   int32_t a;
};

static inline void
util_format_r32g32b32a32_fixed_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x10000)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x10000)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0x10000)); /* b */
         dst[3] = (float)(pixel.a * (1.0/0x10000)); /* a */
#else
         struct util_format_r32g32b32a32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (float)(pixel.r * (1.0/0x10000)); /* r */
         dst[1] = (float)(pixel.g * (1.0/0x10000)); /* g */
         dst[2] = (float)(pixel.b * (1.0/0x10000)); /* b */
         dst[3] = (float)(pixel.a * (1.0/0x10000)); /* a */
#endif
         src += 16;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Convert a width x height rectangle of RGBA float pixels to
 * R32G32B32A32_FIXED.
 *
 * Each channel is multiplied by 0x10000 and truncated to a 32-bit signed
 * integer (16 fractional bits).  src_stride and dst_stride are the byte
 * distances between consecutive rows.
 *
 * The scaled value is saturated to [INT32_MIN, INT32_MAX] before the
 * conversion.  Clamping the input to [-65536, 65535] is not sufficient:
 * 65535 * 0x10000 does not fit in int32_t, and converting an out-of-range
 * floating-point value to an integer is undefined behaviour.  Inputs that
 * were already representable produce the same result as before.
 */
static inline void
util_format_r32g32b32a32_fixed_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const double fixed_min = (double)INT32_MIN;
   const double fixed_max = (double)INT32_MAX;
   unsigned x, y;

   for (y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;

      for (x = 0; x < width; x += 1) {
         /* The encode is the same for either byte order. */
         struct util_format_r32g32b32a32_fixed pixel;
         /* float * 2^16 is exact in double, so scaling first loses nothing. */
         const double r = (double)src[0] * (double)0x10000;
         const double g = (double)src[1] * (double)0x10000;
         const double b = (double)src[2] * (double)0x10000;
         const double a = (double)src[3] * (double)0x10000;

         pixel.r = (int32_t)CLAMP(r, fixed_min, fixed_max);
         pixel.g = (int32_t)CLAMP(g, fixed_min, fixed_max);
         pixel.b = (int32_t)CLAMP(b, fixed_min, fixed_max);
         pixel.a = (int32_t)CLAMP(a, fixed_min, fixed_max);
         memcpy(dst, &pixel, sizeof pixel);

         src += 4;
         dst += 16;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R32G32B32A32_FIXED texel from src and write it to dst as RGBA
 * floats.
 *
 * All four channels are scaled by 1/0x10000 (16 fractional bits).  The
 * texel coordinates i and j are not used.
 */
static inline void
util_format_r32g32b32a32_fixed_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The decode is the same for either byte order. */
   const double fixed_to_float = 1.0/0x10000;
   struct util_format_r32g32b32a32_fixed texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (float)(texel.r * fixed_to_float); /* r */
   dst[1] = (float)(texel.g * fixed_to_float); /* g */
   dst[2] = (float)(texel.b * fixed_to_float); /* b */
   dst[3] = (float)(texel.a * fixed_to_float); /* a */
}

static inline void
util_format_r32g32b32a32_fixed_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround((CLAMP(pixel.r, 0, 65536) * (1.0/0x10000)) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround((CLAMP(pixel.g, 0, 65536) * (1.0/0x10000)) * 0xff); /* g */
         dst[2] = (uint8_t)util_iround((CLAMP(pixel.b, 0, 65536) * (1.0/0x10000)) * 0xff); /* b */
         dst[3] = (uint8_t)util_iround((CLAMP(pixel.a, 0, 65536) * (1.0/0x10000)) * 0xff); /* a */
#else
         struct util_format_r32g32b32a32_fixed pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (uint8_t)util_iround((CLAMP(pixel.r, 0, 65536) * (1.0/0x10000)) * 0xff); /* r */
         dst[1] = (uint8_t)util_iround((CLAMP(pixel.g, 0, 65536) * (1.0/0x10000)) * 0xff); /* g */
         dst[2] = (uint8_t)util_iround((CLAMP(pixel.b, 0, 65536) * (1.0/0x10000)) * 0xff); /* b */
         dst[3] = (uint8_t)util_iround((CLAMP(pixel.a, 0, 65536) * (1.0/0x10000)) * 0xff); /* a */
#endif
         src += 16;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32a32_fixed_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_fixed pixel;
         pixel.r = (int32_t)((float)(src[0] * (1.0f/0xff)) * (double)0x10000);
         pixel.g = (int32_t)((float)(src[1] * (1.0f/0xff)) * (double)0x10000);
         pixel.b = (int32_t)((float)(src[2] * (1.0f/0xff)) * (double)0x10000);
         pixel.a = (int32_t)((float)(src[3] * (1.0f/0xff)) * (double)0x10000);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_fixed pixel;
         pixel.r = (int32_t)((float)(src[0] * (1.0f/0xff)) * (double)0x10000);
         pixel.g = (int32_t)((float)(src[1] * (1.0f/0xff)) * (double)0x10000);
         pixel.b = (int32_t)((float)(src[2] * (1.0f/0xff)) * (double)0x10000);
         pixel.a = (int32_t)((float)(src[3] * (1.0f/0xff)) * (double)0x10000);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Convert a width x height rectangle of R10G10B10X2_USCALED pixels to RGBA
 * float.
 *
 * Each pixel is one 32-bit word in host byte order: red in bits 0-9, green
 * in bits 10-19, blue in bits 20-29; the top two bits are ignored.  Channel
 * values are converted to float unscaled (0..1023) and alpha is set to 1.
 * src_stride and dst_stride are the byte distances between consecutive rows.
 *
 * The word is read with memcpy() rather than through a cast uint32_t
 * pointer, so src does not have to be 4-byte aligned and no aliasing rule
 * is violated.
 */
static inline void
util_format_r10g10b10x2_uscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;

   for (y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;

      for (x = 0; x < width; x += 1) {
         /* The bit positions are the same for either byte order. */
         uint32_t value;
         uint32_t r, g, b;

         memcpy(&value, src, sizeof value);
         r = (value) & 0x3ff;
         g = (value >> 10) & 0x3ff;
         b = (value >> 20) & 0x3ff;
         dst[0] = (float)r; /* r */
         dst[1] = (float)g; /* g */
         dst[2] = (float)b; /* b */
         dst[3] = 1; /* a */

         src += 4;
         dst += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Convert a width x height rectangle of RGBA float pixels to
 * R10G10B10X2_USCALED.
 *
 * Red, green and blue are clamped to [0, 1023], truncated to integers and
 * packed into bits 0-9, 10-19 and 20-29 of a 32-bit word in host byte
 * order; the top two bits are written as 0 and source alpha is ignored.
 * src_stride and dst_stride are the byte distances between consecutive rows.
 *
 * The word is stored with memcpy() rather than through a cast uint32_t
 * pointer, so dst does not have to be 4-byte aligned and no aliasing rule
 * is violated.
 */
static inline void
util_format_r10g10b10x2_uscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;

   for (y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;

      for (x = 0; x < width; x += 1) {
         /* The bit positions are the same for either byte order. */
         uint32_t value = 0;

         value |= ((uint32_t)CLAMP(src[0], 0.0f, 1023.0f)) & 0x3ff;
         value |= (uint32_t)(((uint32_t)CLAMP(src[1], 0.0f, 1023.0f)) & 0x3ff) << 10;
         value |= (uint32_t)(((uint32_t)CLAMP(src[2], 0.0f, 1023.0f)) & 0x3ff) << 20;
         memcpy(dst, &value, sizeof value);

         src += 4;
         dst += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R10G10B10X2_USCALED texel from src and write it to dst as RGBA
 * floats.
 *
 * The texel is one 32-bit word in host byte order: red in bits 0-9, green
 * in bits 10-19, blue in bits 20-29.  Channels are converted to float
 * unscaled (0..1023) and alpha is set to 1.  The texel coordinates i and j
 * are not used.
 *
 * The word is read with memcpy() rather than through a cast uint32_t
 * pointer, so src does not have to be 4-byte aligned and no aliasing rule
 * is violated.
 */
static inline void
util_format_r10g10b10x2_uscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The bit positions are the same for either byte order. */
   uint32_t value;
   uint32_t r, g, b;

   memcpy(&value, src, sizeof value);
   r = (value) & 0x3ff;
   g = (value >> 10) & 0x3ff;
   b = (value >> 20) & 0x3ff;
   dst[0] = (float)r; /* r */
   dst[1] = (float)g; /* g */
   dst[2] = (float)b; /* b */
   dst[3] = 1; /* a */
}

/**
 * Convert a width x height rectangle of R10G10B10X2_USCALED pixels to RGBA8
 * UNORM.
 *
 * Each pixel is one 32-bit word in host byte order: red in bits 0-9, green
 * in bits 10-19, blue in bits 20-29.  A channel value of 0 maps to 0 and any
 * non-zero value saturates to 255 (the value is clamped to 1 before being
 * scaled by 0xff); alpha is written as 255.  src_stride and dst_stride are
 * the byte distances between consecutive rows.
 *
 * The word is read with memcpy() rather than through a cast uint32_t
 * pointer, so src does not have to be 4-byte aligned and no aliasing rule
 * is violated.
 */
static inline void
util_format_r10g10b10x2_uscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;

   for (y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;

      for (x = 0; x < width; x += 1) {
         /* The bit positions are the same for either byte order. */
         uint32_t value;
         uint32_t r, g, b;

         memcpy(&value, src, sizeof value);
         r = (value) & 0x3ff;
         g = (value >> 10) & 0x3ff;
         b = (value >> 20) & 0x3ff;
         /* min(channel, 1) * 0xff: zero stays zero, anything else is 255. */
         dst[0] = (uint8_t)(r ? 0xff : 0); /* r */
         dst[1] = (uint8_t)(g ? 0xff : 0); /* g */
         dst[2] = (uint8_t)(b ? 0xff : 0); /* b */
         dst[3] = 255; /* a */

         src += 4;
         dst += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Convert a width x height rectangle of RGBA8 UNORM pixels to
 * R10G10B10X2_USCALED.
 *
 * Each of the red, green and blue bytes is divided by 0xff using integer
 * division, so only 255 becomes 1 and every other value becomes 0.  The
 * results are packed into bits 0-9, 10-19 and 20-29 of a 32-bit word in
 * host byte order; source alpha is ignored.  src_stride and dst_stride are
 * the byte distances between consecutive rows.
 *
 * The word is stored with memcpy() rather than through a cast uint32_t
 * pointer, so dst does not have to be 4-byte aligned and no aliasing rule
 * is violated.
 */
static inline void
util_format_r10g10b10x2_uscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;

   for (y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;

      for (x = 0; x < width; x += 1) {
         /* The bit positions are the same for either byte order. */
         uint32_t value = 0;

         value |= ((uint32_t)(((uint32_t)src[0]) * 0x1 / 0xff)) & 0x3ff;
         value |= (uint32_t)(((uint32_t)(((uint32_t)src[1]) * 0x1 / 0xff)) & 0x3ff) << 10;
         value |= (uint32_t)(((uint32_t)(((uint32_t)src[2]) * 0x1 / 0xff)) & 0x3ff) << 20;
         memcpy(dst, &value, sizeof value);

         src += 4;
         dst += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Convert a width x height rectangle of R10G10B10X2_SNORM pixels to RGBA
 * float.
 *
 * Each pixel is one 32-bit word in host byte order holding three 10-bit
 * two's-complement channels: red in bits 0-9, green in bits 10-19, blue in
 * bits 20-29.  Each channel is divided by 0x1ff and alpha is set to 1.
 * src_stride and dst_stride are the byte distances between consecutive rows.
 *
 * Two fixes over the straightforward decode:
 *  - the most negative code (-512) would decode to slightly below -1.0, so
 *    the result is clamped to -1.0 as the GL/Vulkan SNORM conversion rule
 *    f = max(c / (2^(b-1) - 1), -1.0) requires;
 *  - the word is read with memcpy() rather than through a cast uint32_t
 *    pointer, so src does not have to be 4-byte aligned.
 */
static inline void
util_format_r10g10b10x2_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;

   for (y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;

      for (x = 0; x < width; x += 1) {
         /* The bit positions are the same for either byte order. */
         uint32_t value;
         int32_t r, g, b;
         float rf, gf, bf;

         memcpy(&value, src, sizeof value);
         /* Shift the field to the top, then shift back down to sign-extend. */
         r = ((int32_t)(value << 22) ) >> 22;
         g = ((int32_t)(value << 12) ) >> 22;
         b = ((int32_t)(value << 2) ) >> 22;

         rf = (float)(r * (1.0f/0x1ff));
         gf = (float)(g * (1.0f/0x1ff));
         bf = (float)(b * (1.0f/0x1ff));

         dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
         dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
         dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
         dst[3] = 1; /* a */

         src += 4;
         dst += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Convert a width x height rectangle of RGBA float pixels to
 * R10G10B10X2_SNORM.
 *
 * Red, green and blue are clamped to [-1, 1], scaled by 0x1ff, converted to
 * an integer by util_iround(), masked to 10 bits and packed into bits 0-9,
 * 10-19 and 20-29 of a 32-bit word in host byte order; source alpha is
 * ignored.  src_stride and dst_stride are the byte distances between
 * consecutive rows.
 *
 * The word is stored with memcpy() rather than through a cast uint32_t
 * pointer, so dst does not have to be 4-byte aligned and no aliasing rule
 * is violated.
 */
static inline void
util_format_r10g10b10x2_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;

   for (y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;

      for (x = 0; x < width; x += 1) {
         /* The bit positions are the same for either byte order. */
         uint32_t value = 0;

         value |= (uint32_t)(((uint32_t)util_iround(CLAMP(src[0], -1.0f, 1.0f) * 0x1ff)) & 0x3ff) ;
         value |= (uint32_t)((uint32_t)(((uint32_t)util_iround(CLAMP(src[1], -1.0f, 1.0f) * 0x1ff)) & 0x3ff) << 10) ;
         value |= (uint32_t)((uint32_t)(((uint32_t)util_iround(CLAMP(src[2], -1.0f, 1.0f) * 0x1ff)) & 0x3ff) << 20) ;
         memcpy(dst, &value, sizeof value);

         src += 4;
         dst += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R10G10B10X2_SNORM texel from src and write it to dst as RGBA
 * floats.
 *
 * The texel is one 32-bit word in host byte order holding three 10-bit
 * two's-complement channels: red in bits 0-9, green in bits 10-19, blue in
 * bits 20-29.  Each channel is divided by 0x1ff and alpha is set to 1.  The
 * texel coordinates i and j are not used.
 *
 * Two fixes over the straightforward decode:
 *  - the most negative code (-512) would decode to slightly below -1.0, so
 *    the result is clamped to -1.0 as the GL/Vulkan SNORM conversion rule
 *    f = max(c / (2^(b-1) - 1), -1.0) requires;
 *  - the word is read with memcpy() rather than through a cast uint32_t
 *    pointer, so src does not have to be 4-byte aligned.
 */
static inline void
util_format_r10g10b10x2_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The bit positions are the same for either byte order. */
   uint32_t value;
   int32_t r, g, b;
   float rf, gf, bf;

   memcpy(&value, src, sizeof value);
   /* Shift the field to the top, then shift back down to sign-extend. */
   r = ((int32_t)(value << 22) ) >> 22;
   g = ((int32_t)(value << 12) ) >> 22;
   b = ((int32_t)(value << 2) ) >> 22;

   rf = (float)(r * (1.0f/0x1ff));
   gf = (float)(g * (1.0f/0x1ff));
   bf = (float)(b * (1.0f/0x1ff));

   dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
   dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
   dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
   dst[3] = 1; /* a */
}

/**
 * Convert a width x height rectangle of R10G10B10X2_SNORM pixels to RGBA8
 * UNORM.
 *
 * Each pixel is one 32-bit word in host byte order holding three 10-bit
 * two's-complement channels: red in bits 0-9, green in bits 10-19, blue in
 * bits 20-29.  Negative channel values become 0; non-negative values
 * (0..511) are shifted right by one to give 0..255.  Alpha is written as
 * 255.  src_stride and dst_stride are the byte distances between
 * consecutive rows.
 *
 * The word is read with memcpy() rather than through a cast uint32_t
 * pointer, so src does not have to be 4-byte aligned and no aliasing rule
 * is violated.
 */
static inline void
util_format_r10g10b10x2_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;

   for (y = 0; y < height; y += 1) {
      uint8_t *dst = dst_row;
      const uint8_t *src = src_row;

      for (x = 0; x < width; x += 1) {
         /* The bit positions are the same for either byte order. */
         uint32_t value;
         int32_t r, g, b;

         memcpy(&value, src, sizeof value);
         /* Shift the field to the top, then shift back down to sign-extend. */
         r = ((int32_t)(value << 22) ) >> 22;
         g = ((int32_t)(value << 12) ) >> 22;
         b = ((int32_t)(value << 2) ) >> 22;
         dst[0] = (uint8_t)((r > 0 ? r : 0) >> 1); /* r */
         dst[1] = (uint8_t)((g > 0 ? g : 0) >> 1); /* g */
         dst[2] = (uint8_t)((b > 0 ? b : 0) >> 1); /* b */
         dst[3] = 255; /* a */

         src += 4;
         dst += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Convert a width x height rectangle of RGBA8 UNORM pixels to
 * R10G10B10X2_SNORM.
 *
 * Each of the red, green and blue bytes is rescaled from 0..255 to 0..511
 * with integer arithmetic (value * 0x1ff / 0xff) and packed into bits 0-9,
 * 10-19 and 20-29 of a 32-bit word in host byte order; source alpha is
 * ignored.  src_stride and dst_stride are the byte distances between
 * consecutive rows.
 *
 * The word is stored with memcpy() rather than through a cast uint32_t
 * pointer, so dst does not have to be 4-byte aligned and no aliasing rule
 * is violated.
 */
static inline void
util_format_r10g10b10x2_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;

   for (y = 0; y < height; y += 1) {
      const uint8_t *src = src_row;
      uint8_t *dst = dst_row;

      for (x = 0; x < width; x += 1) {
         /* The bit positions are the same for either byte order. */
         uint32_t value = 0;

         value |= (uint32_t)(((uint32_t)(((uint32_t)src[0]) * 0x1ff / 0xff)) & 0x3ff) ;
         value |= (uint32_t)((uint32_t)(((uint32_t)(((uint32_t)src[1]) * 0x1ff / 0xff)) & 0x3ff) << 10) ;
         value |= (uint32_t)((uint32_t)(((uint32_t)(((uint32_t)src[2]) * 0x1ff / 0xff)) & 0x3ff) << 20) ;
         memcpy(dst, &value, sizeof value);

         src += 4;
         dst += 4;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Convert a width x height rectangle of A4R4_UNORM pixels to RGBA float.
 *
 * Each source byte holds alpha in its low nibble and red in its high
 * nibble.  Both are scaled by 1/0xf; green and blue are set to 0.
 * src_stride and dst_stride are the byte distances between consecutive rows.
 */
static inline void
util_format_a4r4_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const float nibble_to_float = 1.0f/0xf;
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (col = 0; col < width; ++col) {
         /* A single byte decodes the same way for either byte order. */
         const uint8_t packed = *in;
         const uint8_t a = packed & 0xf;
         const uint8_t r = packed >> 4;

         out[0] = (float)(r * nibble_to_float); /* r */
         out[1] = 0.0f; /* g */
         out[2] = 0.0f; /* b */
         out[3] = (float)(a * nibble_to_float); /* a */

         in += 1;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Convert a width x height rectangle of RGBA float pixels to A4R4_UNORM.
 *
 * Red and alpha are clamped to [0, 1], scaled by 0xf and converted to an
 * integer by util_iround().  Alpha goes in the low nibble and red in the
 * high nibble of the output byte; source green and blue are ignored.
 * src_stride and dst_stride are the byte distances between consecutive rows.
 */
static inline void
util_format_a4r4_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const float *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col) {
         /* A single byte encodes the same way for either byte order. */
         const uint8_t a = (uint8_t)util_iround(CLAMP(in[3], 0.0f, 1.0f) * 0xf);
         const uint8_t r = (uint8_t)util_iround(CLAMP(in[0], 0.0f, 1.0f) * 0xf);

         *out = (uint8_t)((a & 0xf) | ((uint32_t)r << 4));

         in += 4;
         out += 1;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one A4R4_UNORM texel from src and write it to dst as RGBA floats.
 *
 * The source byte holds alpha in its low nibble and red in its high nibble.
 * Both are scaled by 1/0xf; green and blue are set to 0.  The texel
 * coordinates i and j are not used.
 */
static inline void
util_format_a4r4_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* A single byte decodes the same way for either byte order. */
   const float nibble_to_float = 1.0f/0xf;
   const uint8_t packed = *src;
   const uint8_t a = packed & 0xf;
   const uint8_t r = packed >> 4;

   dst[0] = (float)(r * nibble_to_float); /* r */
   dst[1] = 0.0f; /* g */
   dst[2] = 0.0f; /* b */
   dst[3] = (float)(a * nibble_to_float); /* a */
}

/**
 * Convert a width x height rectangle of A4R4_UNORM pixels to RGBA8 UNORM.
 *
 * Each source byte holds alpha in its low nibble and red in its high
 * nibble.  Both are rescaled from 0..15 to 0..255 with integer arithmetic
 * (value * 0xff / 0xf); green and blue are written as 0.  src_stride and
 * dst_stride are the byte distances between consecutive rows.
 */
static inline void
util_format_a4r4_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col) {
         /* A single byte decodes the same way for either byte order. */
         const uint8_t packed = *in;
         const uint8_t a = packed & 0xf;
         const uint8_t r = packed >> 4;

         out[0] = (uint8_t)(((uint32_t)r) * 0xff / 0xf); /* r */
         out[1] = 0; /* g */
         out[2] = 0; /* b */
         out[3] = (uint8_t)(((uint32_t)a) * 0xff / 0xf); /* a */

         in += 1;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Convert a width x height rectangle of RGBA8 UNORM pixels to A4R4_UNORM.
 *
 * The top four bits of source alpha go in the low nibble and the top four
 * bits of source red in the high nibble of the output byte; source green
 * and blue are ignored.  src_stride and dst_stride are the byte distances
 * between consecutive rows.
 */
static inline void
util_format_a4r4_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      uint8_t *out = dst_row;

      for (col = 0; col < width; ++col) {
         /* A single byte encodes the same way for either byte order. */
         const uint8_t a = (uint8_t)(in[3] >> 4);
         const uint8_t r = (uint8_t)(in[0] >> 4);

         *out = (uint8_t)((a & 0xf) | ((uint32_t)r << 4));

         in += 4;
         out += 1;
      }

      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Convert a width x height rectangle of R4A4_UNORM pixels to RGBA float.
 *
 * Each source byte holds red in its low nibble and alpha in its high
 * nibble.  Both are scaled by 1/0xf; green and blue are set to 0.
 * src_stride and dst_stride are the byte distances between consecutive rows.
 */
static inline void
util_format_r4a4_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   const float nibble_to_float = 1.0f/0xf;
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      float *out = dst_row;

      for (col = 0; col < width; ++col) {
         /* A single byte decodes the same way for either byte order. */
         const uint8_t packed = *in;
         const uint8_t r = packed & 0xf;
         const uint8_t a = packed >> 4;

         out[0] = (float)(r * nibble_to_float); /* r */
         out[1] = 0.0f; /* g */
         out[2] = 0.0f; /* b */
         out[3] = (float)(a * nibble_to_float); /* a */

         in += 1;
         out += 4;
      }

      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of float RGBA pixels into one-byte R4A4 texels.
 *
 * Red (src[0]) and alpha (src[3]) are clamped to [0, 1], multiplied by 15
 * and rounded with util_iround(); red goes to bits 3..0 and alpha to bits
 * 7..4.  Green and blue are ignored.  Strides are byte counts (the source
 * stride is divided by sizeof(float)).
 */
static inline void
util_format_r4a4_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *texel = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, ++texel) {
         /* One byte per texel: identical encoding on either endianness. */
         const uint8_t red4 = (uint8_t)util_iround(CLAMP(rgba[0], 0.0f, 1.0f) * 0xf);
         const uint8_t alpha4 = (uint8_t)util_iround(CLAMP(rgba[3], 0.0f, 1.0f) * 0xf);
         *texel = (uint8_t)((red4 & 0xf) | ((uint32_t)alpha4 << 4));
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Decode the single R4A4 texel at src into four floats at dst.
 *
 * Red is the low nibble and alpha the high nibble, each scaled by 1/15;
 * green and blue are written as 0.  The i and j arguments are not used.
 */
static inline void
util_format_r4a4_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* One byte per texel: identical decoding on either endianness. */
   const uint8_t packed = *src;
   const uint8_t red4 = packed & 0xf;
   const uint8_t alpha4 = packed >> 4;

   dst[0] = (float)(red4 * (1.0f/0xf));
   dst[1] = 0;
   dst[2] = 0;
   dst[3] = (float)(alpha4 * (1.0f/0xf));
}

/**
 * Unpack rows of one-byte R4A4 texels into 8-bit RGBA.
 *
 * Red is the low nibble and alpha the high nibble of each source byte; each
 * nibble n is expanded to n * 255 / 15.  Green and blue are written as 0.
 * Both strides are byte counts.
 */
static inline void
util_format_r4a4_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      for (unsigned col = 0; col < width; ++col) {
         /* One byte per texel: identical decoding on either endianness. */
         const uint8_t packed = src_row[col];
         uint8_t *rgba = dst_row + 4 * col;
         const uint32_t red4 = packed & 0xf;
         const uint32_t alpha4 = packed >> 4;

         rgba[0] = (uint8_t)(red4 * 0xff / 0xf);
         rgba[1] = 0;
         rgba[2] = 0;
         rgba[3] = (uint8_t)(alpha4 * 0xff / 0xf);
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack rows of 8-bit RGBA pixels into one-byte R4A4 texels.
 *
 * The top four bits of red (src[0]) go to bits 3..0 and the top four bits
 * of alpha (src[3]) go to bits 7..4.  Green and blue are ignored.  Both
 * strides are byte counts.
 */
static inline void
util_format_r4a4_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *texel = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, ++texel) {
         /* One byte per texel: identical encoding on either endianness. */
         const uint8_t red4 = (uint8_t)(rgba[0] >> 4);
         const uint8_t alpha4 = (uint8_t)(rgba[3] >> 4);
         *texel = (uint8_t)((red4 & 0xf) | ((uint32_t)alpha4 << 4));
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Unpack rows of two-byte R8A8 texels into float RGBA.
 *
 * Each texel is loaded as a 16-bit word.  On big-endian builds red is the
 * high byte and alpha the low byte; otherwise red is the low byte and alpha
 * the high byte.  Both are converted with ubyte_to_float(); green and blue
 * are written as 0.  Strides are byte counts (the destination stride is
 * divided by sizeof(float)).
 */
static inline void
util_format_r8a8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *texel = src_row;
      float *rgba = dst_row;
      for (unsigned col = 0; col < width; ++col, texel += 2, rgba += 4) {
         const uint16_t packed = *(const uint16_t *)texel;
#if UTIL_ARCH_BIG_ENDIAN
         const uint16_t red = packed >> 8;
         const uint16_t alpha = packed & 0xff;
#else
         const uint16_t red = packed & 0xff;
         const uint16_t alpha = packed >> 8;
#endif
         rgba[0] = ubyte_to_float(red);
         rgba[1] = 0;
         rgba[2] = 0;
         rgba[3] = ubyte_to_float(alpha);
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack rows of float RGBA pixels into two-byte R8A8 texels.
 *
 * Red (src[0]) and alpha (src[3]) are converted with float_to_ubyte() and
 * stored as one 16-bit word: red in the high byte on big-endian builds,
 * in the low byte otherwise.  Green and blue are ignored.  Strides are byte
 * counts (the source stride is divided by sizeof(float)).
 */
static inline void
util_format_r8a8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *texel = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, texel += 2) {
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t hi = float_to_ubyte(rgba[0]);
         const uint32_t lo = float_to_ubyte(rgba[3]);
#else
         const uint32_t lo = float_to_ubyte(rgba[0]);
         const uint32_t hi = float_to_ubyte(rgba[3]);
#endif
         *(uint16_t *)texel = (uint16_t)((hi << 8) | (lo & 0xff));
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Decode the single R8A8 texel at src into four floats at dst.
 *
 * The texel is loaded as a 16-bit word; red is its high byte on big-endian
 * builds and its low byte otherwise, with alpha in the other byte.  Both go
 * through ubyte_to_float(); green and blue are written as 0.  The i and j
 * arguments are not used.
 */
static inline void
util_format_r8a8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint16_t packed = *(const uint16_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
   const uint16_t red = packed >> 8;
   const uint16_t alpha = packed & 0xff;
#else
   const uint16_t red = packed & 0xff;
   const uint16_t alpha = packed >> 8;
#endif
   dst[0] = ubyte_to_float(red);
   dst[1] = 0;
   dst[2] = 0;
   dst[3] = ubyte_to_float(alpha);
}

/**
 * Unpack rows of two-byte R8A8 texels into 8-bit RGBA.
 *
 * Each texel is loaded as a 16-bit word.  On big-endian builds red is the
 * high byte and alpha the low byte; otherwise red is the low byte and alpha
 * the high byte.  Green and blue are written as 0.  Both strides are byte
 * counts.
 */
static inline void
util_format_r8a8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *texel = src_row;
      uint8_t *rgba = dst_row;
      for (unsigned col = 0; col < width; ++col, texel += 2, rgba += 4) {
         const uint16_t packed = *(const uint16_t *)texel;
#if UTIL_ARCH_BIG_ENDIAN
         const uint16_t red = packed >> 8;
         const uint16_t alpha = packed & 0xff;
#else
         const uint16_t red = packed & 0xff;
         const uint16_t alpha = packed >> 8;
#endif
         rgba[0] = red;
         rgba[1] = 0;
         rgba[2] = 0;
         rgba[3] = alpha;
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack rows of 8-bit RGBA pixels into two-byte R8A8 texels.
 *
 * Red (src[0]) and alpha (src[3]) are stored as one 16-bit word: red in the
 * high byte on big-endian builds, in the low byte otherwise.  Green and
 * blue are ignored.  Both strides are byte counts.
 */
static inline void
util_format_r8a8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *texel = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, texel += 2) {
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t hi = rgba[0];
         const uint32_t lo = rgba[3];
#else
         const uint32_t lo = rgba[0];
         const uint32_t hi = rgba[3];
#endif
         *(uint16_t *)texel = (uint16_t)((hi << 8) | (lo & 0xff));
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Unpack rows of two-byte A8R8 texels into float RGBA.
 *
 * Each texel is loaded as a 16-bit word.  On big-endian builds alpha is the
 * high byte and red the low byte; otherwise alpha is the low byte and red
 * the high byte.  Both are converted with ubyte_to_float(); green and blue
 * are written as 0.  Strides are byte counts (the destination stride is
 * divided by sizeof(float)).
 */
static inline void
util_format_a8r8_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *texel = src_row;
      float *rgba = dst_row;
      for (unsigned col = 0; col < width; ++col, texel += 2, rgba += 4) {
         const uint16_t packed = *(const uint16_t *)texel;
#if UTIL_ARCH_BIG_ENDIAN
         const uint16_t alpha = packed >> 8;
         const uint16_t red = packed & 0xff;
#else
         const uint16_t alpha = packed & 0xff;
         const uint16_t red = packed >> 8;
#endif
         rgba[0] = ubyte_to_float(red);
         rgba[1] = 0;
         rgba[2] = 0;
         rgba[3] = ubyte_to_float(alpha);
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack rows of float RGBA pixels into two-byte A8R8 texels.
 *
 * Alpha (src[3]) and red (src[0]) are converted with float_to_ubyte() and
 * stored as one 16-bit word: alpha in the high byte on big-endian builds,
 * in the low byte otherwise.  Green and blue are ignored.  Strides are byte
 * counts (the source stride is divided by sizeof(float)).
 */
static inline void
util_format_a8r8_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *texel = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, texel += 2) {
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t hi = float_to_ubyte(rgba[3]);
         const uint32_t lo = float_to_ubyte(rgba[0]);
#else
         const uint32_t lo = float_to_ubyte(rgba[3]);
         const uint32_t hi = float_to_ubyte(rgba[0]);
#endif
         *(uint16_t *)texel = (uint16_t)((hi << 8) | (lo & 0xff));
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Decode the single A8R8 texel at src into four floats at dst.
 *
 * The texel is loaded as a 16-bit word; alpha is its high byte on
 * big-endian builds and its low byte otherwise, with red in the other byte.
 * Both go through ubyte_to_float(); green and blue are written as 0.  The
 * i and j arguments are not used.
 */
static inline void
util_format_a8r8_unorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint16_t packed = *(const uint16_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
   const uint16_t alpha = packed >> 8;
   const uint16_t red = packed & 0xff;
#else
   const uint16_t alpha = packed & 0xff;
   const uint16_t red = packed >> 8;
#endif
   dst[0] = ubyte_to_float(red);
   dst[1] = 0;
   dst[2] = 0;
   dst[3] = ubyte_to_float(alpha);
}

/**
 * Unpack rows of two-byte A8R8 texels into 8-bit RGBA.
 *
 * Each texel is loaded as a 16-bit word.  On big-endian builds alpha is the
 * high byte and red the low byte; otherwise alpha is the low byte and red
 * the high byte.  Green and blue are written as 0.  Both strides are byte
 * counts.
 */
static inline void
util_format_a8r8_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *texel = src_row;
      uint8_t *rgba = dst_row;
      for (unsigned col = 0; col < width; ++col, texel += 2, rgba += 4) {
         const uint16_t packed = *(const uint16_t *)texel;
#if UTIL_ARCH_BIG_ENDIAN
         const uint16_t alpha = packed >> 8;
         const uint16_t red = packed & 0xff;
#else
         const uint16_t alpha = packed & 0xff;
         const uint16_t red = packed >> 8;
#endif
         rgba[0] = red;
         rgba[1] = 0;
         rgba[2] = 0;
         rgba[3] = alpha;
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack rows of 8-bit RGBA pixels into two-byte A8R8 texels.
 *
 * Alpha (src[3]) and red (src[0]) are stored as one 16-bit word: alpha in
 * the high byte on big-endian builds, in the low byte otherwise.  Green and
 * blue are ignored.  Both strides are byte counts.
 */
static inline void
util_format_a8r8_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *texel = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, texel += 2) {
#if UTIL_ARCH_BIG_ENDIAN
         const uint32_t hi = rgba[3];
         const uint32_t lo = rgba[0];
#else
         const uint32_t lo = rgba[3];
         const uint32_t hi = rgba[0];
#endif
         *(uint16_t *)texel = (uint16_t)((hi << 8) | (lo & 0xff));
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Unpack rows of 32-bit R10G10B10A2 unsigned texels into float RGBA.
 *
 * Each texel is loaded as a 32-bit word with red in bits 9..0, green in
 * bits 19..10, blue in bits 29..20 and alpha in bits 31..30.  The raw
 * integer field values are converted to float without scaling.  Strides are
 * byte counts (the destination stride is divided by sizeof(float)).
 */
static inline void
util_format_r10g10b10a2_uscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *texel = src_row;
      float *rgba = dst_row;
      for (unsigned col = 0; col < width; ++col, texel += 4, rgba += 4) {
         /* The field positions inside the loaded word are the same on
          * big- and little-endian builds. */
         const uint32_t word = *(const uint32_t *)texel;

         rgba[0] = (float)(word & 0x3ff);
         rgba[1] = (float)((word >> 10) & 0x3ff);
         rgba[2] = (float)((word >> 20) & 0x3ff);
         rgba[3] = (float)(word >> 30);
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack rows of float RGBA pixels into 32-bit R10G10B10A2 unsigned texels.
 *
 * Red, green and blue are clamped to [0, 1023] and alpha to [0, 3], then
 * converted to integers by a plain cast (no scaling).  Red goes to bits
 * 9..0, green to 19..10, blue to 29..20 and alpha to 31..30 of the stored
 * word.  Strides are byte counts (the source stride is divided by
 * sizeof(float)).
 */
static inline void
util_format_r10g10b10a2_uscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *texel = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, texel += 4) {
         /* The field positions inside the stored word are the same on
          * big- and little-endian builds. */
         const uint32_t red = ((uint32_t)CLAMP(rgba[0], 0.0f, 1023.0f)) & 0x3ff;
         const uint32_t green = ((uint32_t)CLAMP(rgba[1], 0.0f, 1023.0f)) & 0x3ff;
         const uint32_t blue = ((uint32_t)CLAMP(rgba[2], 0.0f, 1023.0f)) & 0x3ff;
         const uint32_t alpha = (uint32_t)CLAMP(rgba[3], 0.0f, 3.0f);

         *(uint32_t *)texel = red | (green << 10) | (blue << 20) | (alpha << 30);
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Decode the single R10G10B10A2 unsigned texel at src into four floats.
 *
 * The texel is loaded as a 32-bit word with red in bits 9..0, green in
 * 19..10, blue in 29..20 and alpha in 31..30; the raw field values are
 * converted to float without scaling.  The i and j arguments are not used.
 */
static inline void
util_format_r10g10b10a2_uscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The field positions inside the loaded word are the same on big- and
    * little-endian builds. */
   const uint32_t word = *(const uint32_t *)src;

   dst[0] = (float)(word & 0x3ff);
   dst[1] = (float)((word >> 10) & 0x3ff);
   dst[2] = (float)((word >> 20) & 0x3ff);
   dst[3] = (float)(word >> 30);
}

/**
 * Unpack rows of 32-bit R10G10B10A2 unsigned texels into 8-bit RGBA.
 *
 * Each texel is loaded as a 32-bit word with red in bits 9..0, green in
 * 19..10, blue in 29..20 and alpha in 31..30.  Every field is limited to at
 * most 1 with MIN2() and multiplied by 255, so a zero field gives 0 and any
 * non-zero field gives 255.  Both strides are byte counts.
 */
static inline void
util_format_r10g10b10a2_uscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *texel = src_row;
      uint8_t *rgba = dst_row;
      for (unsigned col = 0; col < width; ++col, texel += 4, rgba += 4) {
         /* The field positions inside the loaded word are the same on
          * big- and little-endian builds. */
         const uint32_t word = *(const uint32_t *)texel;
         const uint32_t red = word & 0x3ff;
         const uint32_t green = (word >> 10) & 0x3ff;
         const uint32_t blue = (word >> 20) & 0x3ff;
         const uint32_t alpha = word >> 30;

         rgba[0] = (uint8_t)(((uint32_t)MIN2(red, 1)) * 0xff / 0x1);
         rgba[1] = (uint8_t)(((uint32_t)MIN2(green, 1)) * 0xff / 0x1);
         rgba[2] = (uint8_t)(((uint32_t)MIN2(blue, 1)) * 0xff / 0x1);
         rgba[3] = (uint8_t)(((uint32_t)MIN2(alpha, 1)) * 0xff / 0x1);
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack rows of 8-bit RGBA pixels into 32-bit R10G10B10A2 unsigned texels.
 *
 * Every input byte is mapped with integer arithmetic to byte * 1 / 255, so
 * only 255 becomes 1 and everything else becomes 0.  Red goes to bits 9..0,
 * green to 19..10, blue to 29..20 and alpha to 31..30 of the stored word.
 * Both strides are byte counts.
 */
static inline void
util_format_r10g10b10a2_uscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *texel = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, texel += 4) {
         /* The field positions inside the stored word are the same on
          * big- and little-endian builds. */
         const uint32_t red = (((uint32_t)rgba[0]) * 0x1 / 0xff) & 0x3ff;
         const uint32_t green = (((uint32_t)rgba[1]) * 0x1 / 0xff) & 0x3ff;
         const uint32_t blue = (((uint32_t)rgba[2]) * 0x1 / 0xff) & 0x3ff;
         const uint32_t alpha = ((uint32_t)rgba[3]) * 0x1 / 0xff;

         *(uint32_t *)texel = red | (green << 10) | (blue << 20) | (alpha << 30);
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Unpack rows of 32-bit R10G10B10A2 signed texels into float RGBA.
 *
 * Each texel is loaded as a 32-bit word with red in bits 9..0, green in
 * 19..10, blue in 29..20 and alpha in 31..30.  Every field is sign-extended
 * and converted to float without scaling.  Strides are byte counts (the
 * destination stride is divided by sizeof(float)).
 */
static inline void
util_format_r10g10b10a2_sscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *texel = src_row;
      float *rgba = dst_row;
      for (unsigned col = 0; col < width; ++col, texel += 4, rgba += 4) {
         /* The field positions inside the loaded word are the same on
          * big- and little-endian builds. */
         const uint32_t word = *(const uint32_t *)texel;

         /* Move each field to the top of the word, then shift back down as
          * a signed value; this relies on >> of a negative int32_t
          * replicating the sign bit. */
         const int32_t red = ((int32_t)(word << 22)) >> 22;
         const int32_t green = ((int32_t)(word << 12)) >> 22;
         const int32_t blue = ((int32_t)(word << 2)) >> 22;
         const int32_t alpha = ((int32_t)(word)) >> 30;

         rgba[0] = (float)red;
         rgba[1] = (float)green;
         rgba[2] = (float)blue;
         rgba[3] = (float)alpha;
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack rows of float RGBA pixels into 32-bit R10G10B10A2 signed texels.
 *
 * Red, green and blue are clamped to [-512, 511] and alpha to [-2, 1], then
 * truncated to integers (no scaling) and stored as two's-complement fields:
 * red in bits 9..0, green in 19..10, blue in 29..20 and alpha in 31..30.
 * Strides are byte counts (the source stride is divided by sizeof(float)).
 *
 * The float is converted to int32_t before being reinterpreted as unsigned.
 * Converting a negative floating-point value directly to an unsigned type
 * is undefined behaviour in C, whereas int32_t -> uint32_t is a well-defined
 * modulo-2^32 conversion that yields the two's-complement bit pattern the
 * fields need.
 */
static inline void
util_format_r10g10b10a2_sscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         /* The field positions inside the stored word are the same on
          * big- and little-endian builds. */
         const uint32_t r = ((uint32_t)(int32_t)CLAMP(src[0], -512.0f, 511.0f)) & 0x3ff;
         const uint32_t g = ((uint32_t)(int32_t)CLAMP(src[1], -512.0f, 511.0f)) & 0x3ff;
         const uint32_t b = ((uint32_t)(int32_t)CLAMP(src[2], -512.0f, 511.0f)) & 0x3ff;
         /* No mask needed for alpha: the shift by 30 discards all but the
          * low two bits. */
         const uint32_t a = (uint32_t)(int32_t)CLAMP(src[3], -2.0f, 1.0f);

         *(uint32_t *)dst = r | (g << 10) | (b << 20) | (a << 30);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Decode the single R10G10B10A2 signed texel at src into four floats.
 *
 * The texel is loaded as a 32-bit word with red in bits 9..0, green in
 * 19..10, blue in 29..20 and alpha in 31..30; each field is sign-extended
 * and converted to float without scaling.  The i and j arguments are not
 * used.
 */
static inline void
util_format_r10g10b10a2_sscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The field positions inside the loaded word are the same on big- and
    * little-endian builds. */
   const uint32_t word = *(const uint32_t *)src;

   /* Move each field to the top of the word, then shift back down as a
    * signed value; this relies on >> of a negative int32_t replicating the
    * sign bit. */
   const int32_t red = ((int32_t)(word << 22)) >> 22;
   const int32_t green = ((int32_t)(word << 12)) >> 22;
   const int32_t blue = ((int32_t)(word << 2)) >> 22;
   const int32_t alpha = ((int32_t)(word)) >> 30;

   dst[0] = (float)red;
   dst[1] = (float)green;
   dst[2] = (float)blue;
   dst[3] = (float)alpha;
}

/**
 * Unpack rows of 32-bit R10G10B10A2 signed texels into 8-bit RGBA.
 *
 * Each texel is loaded as a 32-bit word with red in bits 9..0, green in
 * 19..10, blue in 29..20 and alpha in 31..30, all sign-extended.  The
 * colour fields are clamped to [0, 1] and alpha is limited to at least 0
 * (its field cannot exceed 1), then multiplied by 255: values <= 0 give 0
 * and values >= 1 give 255.  Both strides are byte counts.
 */
static inline void
util_format_r10g10b10a2_sscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *texel = src_row;
      uint8_t *rgba = dst_row;
      for (unsigned col = 0; col < width; ++col, texel += 4, rgba += 4) {
         /* The field positions inside the loaded word are the same on
          * big- and little-endian builds. */
         const uint32_t word = *(const uint32_t *)texel;
         const int32_t red = ((int32_t)(word << 22)) >> 22;
         const int32_t green = ((int32_t)(word << 12)) >> 22;
         const int32_t blue = ((int32_t)(word << 2)) >> 22;
         const int32_t alpha = ((int32_t)(word)) >> 30;

         rgba[0] = (uint8_t)(((uint32_t)CLAMP(red, 0, 1)) * 0xff / 0x1);
         rgba[1] = (uint8_t)(((uint32_t)CLAMP(green, 0, 1)) * 0xff / 0x1);
         rgba[2] = (uint8_t)(((uint32_t)CLAMP(blue, 0, 1)) * 0xff / 0x1);
         rgba[3] = (uint8_t)(((uint32_t)MAX2(alpha, 0)) * 0xff / 0x1);
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack rows of 8-bit RGBA pixels into 32-bit R10G10B10A2 signed texels.
 *
 * Every input byte is mapped with integer arithmetic to byte * 1 / 255, so
 * only 255 becomes 1 and everything else becomes 0.  Red goes to bits 9..0,
 * green to 19..10, blue to 29..20 and alpha to 31..30 of the stored word.
 * Both strides are byte counts.
 */
static inline void
util_format_r10g10b10a2_sscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *rgba = src_row;
      uint8_t *texel = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, texel += 4) {
         /* The field positions inside the stored word are the same on
          * big- and little-endian builds. */
         const uint32_t red = (((uint32_t)rgba[0]) * 0x1 / 0xff) & 0x3ff;
         const uint32_t green = (((uint32_t)rgba[1]) * 0x1 / 0xff) & 0x3ff;
         const uint32_t blue = (((uint32_t)rgba[2]) * 0x1 / 0xff) & 0x3ff;
         const uint32_t alpha = ((uint32_t)rgba[3]) * 0x1 / 0xff;

         *(uint32_t *)texel = red | (green << 10) | (blue << 20) | (alpha << 30);
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Unpack rows of 32-bit R10G10B10A2 signed-normalized texels to float RGBA.
 *
 * Each texel is loaded as a 32-bit word with red in bits 9..0, green in
 * 19..10, blue in 29..20 and alpha in 31..30, all two's-complement.  The
 * 10-bit fields are scaled by 1/511 and the 2-bit alpha by 1/1.
 *
 * The most negative codes (-512 for a colour field, -2 for alpha) would
 * scale to a value below -1.0, so every channel is clamped to a minimum of
 * -1.0; signed-normalized to float conversion is specified as
 * max(c / (2^(b-1) - 1), -1.0).
 *
 * Strides are byte counts (the destination stride is divided by
 * sizeof(float)).
 */
static inline void
util_format_r10g10b10a2_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      float *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         /* The field positions inside the loaded word are the same on
          * big- and little-endian builds. */
         const uint32_t value = *(const uint32_t *)src;

         /* Move each field to the top of the word, then shift back down as
          * a signed value; this relies on >> of a negative int32_t
          * replicating the sign bit. */
         const int32_t r = ((int32_t)(value << 22)) >> 22;
         const int32_t g = ((int32_t)(value << 12)) >> 22;
         const int32_t b = ((int32_t)(value << 2)) >> 22;
         const int32_t a = ((int32_t)(value)) >> 30;

         const float rf = (float)(r * (1.0f/0x1ff));
         const float gf = (float)(g * (1.0f/0x1ff));
         const float bf = (float)(b * (1.0f/0x1ff));
         const float af = (float)(a * (1.0f/0x1));

         dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
         dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
         dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
         dst[3] = af < -1.0f ? -1.0f : af; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of float RGBA pixels into 32-bit R10G10B10A2 signed-normalized
 * texels.
 *
 * Every channel is clamped to [-1, 1]; red, green and blue are multiplied by
 * 511 and alpha by 1, then rounded with util_iround().  Red goes to bits
 * 9..0, green to 19..10, blue to 29..20 and alpha to 31..30 of the stored
 * word.  Strides are byte counts (the source stride is divided by
 * sizeof(float)).
 */
static inline void
util_format_r10g10b10a2_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const float *rgba = src_row;
      uint8_t *texel = dst_row;
      for (unsigned col = 0; col < width; ++col, rgba += 4, texel += 4) {
         /* The field positions inside the stored word are the same on
          * big- and little-endian builds. */
         const uint32_t red = ((uint32_t)util_iround(CLAMP(rgba[0], -1.0f, 1.0f) * 0x1ff)) & 0x3ff;
         const uint32_t green = ((uint32_t)util_iround(CLAMP(rgba[1], -1.0f, 1.0f) * 0x1ff)) & 0x3ff;
         const uint32_t blue = ((uint32_t)util_iround(CLAMP(rgba[2], -1.0f, 1.0f) * 0x1ff)) & 0x3ff;
         /* Alpha is not masked: the shift by 30 keeps only its low two bits. */
         const uint32_t alpha = (uint32_t)util_iround(CLAMP(rgba[3], -1.0f, 1.0f) * 0x1);

         *(uint32_t *)texel = red | (green << 10) | (blue << 20) | (alpha << 30);
      }
      src_row += src_stride / sizeof(*src_row);
      dst_row += dst_stride;
   }
}

/**
 * Decode the single R10G10B10A2 signed-normalized texel at src into four
 * floats at dst.
 *
 * The texel is loaded as a 32-bit word with red in bits 9..0, green in
 * 19..10, blue in 29..20 and alpha in 31..30, all two's-complement.  The
 * 10-bit fields are scaled by 1/511 and the 2-bit alpha by 1/1.
 *
 * The most negative codes (-512 for a colour field, -2 for alpha) would
 * scale to a value below -1.0, so every channel is clamped to a minimum of
 * -1.0; signed-normalized to float conversion is specified as
 * max(c / (2^(b-1) - 1), -1.0).
 *
 * The i and j arguments are not used.
 */
static inline void
util_format_r10g10b10a2_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* The field positions inside the loaded word are the same on big- and
    * little-endian builds. */
   const uint32_t value = *(const uint32_t *)src;

   /* Move each field to the top of the word, then shift back down as a
    * signed value; this relies on >> of a negative int32_t replicating the
    * sign bit. */
   const int32_t r = ((int32_t)(value << 22)) >> 22;
   const int32_t g = ((int32_t)(value << 12)) >> 22;
   const int32_t b = ((int32_t)(value << 2)) >> 22;
   const int32_t a = ((int32_t)(value)) >> 30;

   const float rf = (float)(r * (1.0f/0x1ff));
   const float gf = (float)(g * (1.0f/0x1ff));
   const float bf = (float)(b * (1.0f/0x1ff));
   const float af = (float)(a * (1.0f/0x1));

   dst[0] = rf < -1.0f ? -1.0f : rf; /* r */
   dst[1] = gf < -1.0f ? -1.0f : gf; /* g */
   dst[2] = bf < -1.0f ? -1.0f : bf; /* b */
   dst[3] = af < -1.0f ? -1.0f : af; /* a */
}

/**
 * Unpack rows of 32-bit R10G10B10A2 signed-normalized texels into 8-bit
 * RGBA.
 *
 * Each texel is loaded as a 32-bit word with red in bits 9..0, green in
 * 19..10, blue in 29..20 and alpha in 31..30, all sign-extended.  Negative
 * fields become 0.  Non-negative colour fields (0..511) are halved to fit
 * eight bits; alpha (0 or 1 once negatives are removed) is multiplied by
 * 255.  Both strides are byte counts.
 */
static inline void
util_format_r10g10b10a2_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   for (unsigned row = 0; row < height; ++row) {
      const uint8_t *texel = src_row;
      uint8_t *rgba = dst_row;
      for (unsigned col = 0; col < width; ++col, texel += 4, rgba += 4) {
         /* The field positions inside the loaded word are the same on
          * big- and little-endian builds. */
         const uint32_t word = *(const uint32_t *)texel;
         const int32_t red = ((int32_t)(word << 22)) >> 22;
         const int32_t green = ((int32_t)(word << 12)) >> 22;
         const int32_t blue = ((int32_t)(word << 2)) >> 22;
         const int32_t alpha = ((int32_t)(word)) >> 30;

         rgba[0] = (uint8_t)(MAX2(red, 0) >> 1);
         rgba[1] = (uint8_t)(MAX2(green, 0) >> 1);
         rgba[2] = (uint8_t)(MAX2(blue, 0) >> 1);
         rgba[3] = (uint8_t)(((uint32_t)MAX2(alpha, 0)) * 0xff / 0x1);
      }
      dst_row += dst_stride / sizeof(*dst_row);
      src_row += src_stride;
   }
}

/**
 * Pack a width x height rectangle of RGBA8 pixels into R10G10B10A2_SNORM.
 *
 * The colour channels are rescaled from 0..0xff to 0..0x1ff (the positive
 * half of the 10-bit signed range); alpha keeps only its top bit.  The
 * result is stored as one 32-bit word per pixel in host byte order with
 * R in bits 0-9, G in bits 10-19, B in bits 20-29 and A in bits 30-31.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_r10g10b10a2_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      uint32_t *out = (uint32_t *)dst_row;
      const uint8_t *in = src_row;
      for (col = 0; col < width; ++col, in += 4) {
         const uint32_t red   = ((uint32_t)in[0] * 0x1ff / 0xff) & 0x3ff;
         const uint32_t green = ((uint32_t)in[1] * 0x1ff / 0xff) & 0x3ff;
         const uint32_t blue  = ((uint32_t)in[2] * 0x1ff / 0xff) & 0x3ff;
         const uint32_t alpha = (uint32_t)(in[3] >> 7);
         out[col] = red | (green << 10) | (blue << 20) | (alpha << 30);
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a width x height rectangle of B10G10R10A2_USCALED pixels to floats.
 *
 * Every source pixel is one 32-bit word loaded in host byte order with the
 * unsigned fields B = bits 0-9, G = bits 10-19, R = bits 20-29 and
 * A = bits 30-31.  Each field is converted to float unchanged (no
 * normalisation) and written in R, G, B, A order.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_b10g10r10a2_uscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint32_t *texel = (const uint32_t *)src_row;
      float *out = dst_row;
      for (col = 0; col < width; ++col, out += 4) {
         const uint32_t packed = texel[col];
         out[0] = (float)((packed >> 20) & 0x3ff); /* r */
         out[1] = (float)((packed >> 10) & 0x3ff); /* g */
         out[2] = (float)(packed & 0x3ff);         /* b */
         out[3] = (float)(packed >> 30);           /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA floats into B10G10R10A2_USCALED.
 *
 * Colour channels are clamped to [0, 1023] and alpha to [0, 3], truncated to
 * integers and stored as one 32-bit word per pixel in host byte order with
 * B in bits 0-9, G in bits 10-19, R in bits 20-29 and A in bits 30-31.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_b10g10r10a2_uscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      uint32_t *out = (uint32_t *)dst_row;
      const float *in = src_row;
      for (col = 0; col < width; ++col, in += 4) {
         const uint32_t blue  = ((uint32_t)CLAMP(in[2], 0.0f, 1023.0f)) & 0x3ff;
         const uint32_t green = ((uint32_t)CLAMP(in[1], 0.0f, 1023.0f)) & 0x3ff;
         const uint32_t red   = ((uint32_t)CLAMP(in[0], 0.0f, 1023.0f)) & 0x3ff;
         const uint32_t alpha = (uint32_t)CLAMP(in[3], 0.0f, 3.0f);
         out[col] = blue | (green << 10) | (red << 20) | (alpha << 30);
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B10G10R10A2_USCALED pixel as four floats (R, G, B, A).
 *
 * src points at the 32-bit pixel word, which is loaded in host byte order;
 * the i and j arguments are not used.  Field values are converted to float
 * without normalisation.
 */
static inline void
util_format_b10g10r10a2_uscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint32_t packed = *(const uint32_t *)src;

   dst[0] = (float)((packed >> 20) & 0x3ff); /* r */
   dst[1] = (float)((packed >> 10) & 0x3ff); /* g */
   dst[2] = (float)(packed & 0x3ff);         /* b */
   dst[3] = (float)(packed >> 30);           /* a */
}

/**
 * Unpack a width x height rectangle of B10G10R10A2_USCALED pixels to RGBA8.
 *
 * Every source pixel is one 32-bit word loaded in host byte order with the
 * unsigned fields B = bits 0-9, G = bits 10-19, R = bits 20-29 and
 * A = bits 30-31.  Each field is limited to at most 1 and then scaled by
 * 0xff, so a zero field yields 0 and any non-zero field yields 0xff.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_b10g10r10a2_uscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint32_t *texel = (const uint32_t *)src_row;
      uint8_t *out = dst_row;
      for (col = 0; col < width; ++col, out += 4) {
         const uint32_t packed = texel[col];
         const uint32_t blue  = packed & 0x3ff;
         const uint32_t green = (packed >> 10) & 0x3ff;
         const uint32_t red   = (packed >> 20) & 0x3ff;
         const uint32_t alpha = packed >> 30;
         out[0] = (uint8_t)(((uint32_t)MIN2(red, 1)) * 0xff);
         out[1] = (uint8_t)(((uint32_t)MIN2(green, 1)) * 0xff);
         out[2] = (uint8_t)(((uint32_t)MIN2(blue, 1)) * 0xff);
         out[3] = (uint8_t)(((uint32_t)MIN2(alpha, 1)) * 0xff);
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA8 pixels into B10G10R10A2_USCALED.
 *
 * Each 8-bit channel is integer-divided by 0xff, so only an input of 0xff
 * produces 1 and everything else produces 0.  The result is stored as one
 * 32-bit word per pixel in host byte order with B in bits 0-9, G in bits
 * 10-19, R in bits 20-29 and A in bits 30-31.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_b10g10r10a2_uscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      uint32_t *out = (uint32_t *)dst_row;
      const uint8_t *in = src_row;
      for (col = 0; col < width; ++col, in += 4) {
         const uint32_t blue  = ((uint32_t)in[2] / 0xff) & 0x3ff;
         const uint32_t green = ((uint32_t)in[1] / 0xff) & 0x3ff;
         const uint32_t red   = ((uint32_t)in[0] / 0xff) & 0x3ff;
         const uint32_t alpha = (uint32_t)in[3] / 0xff;
         out[col] = blue | (green << 10) | (red << 20) | (alpha << 30);
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a width x height rectangle of B10G10R10A2_SSCALED pixels to floats.
 *
 * Every source pixel is one 32-bit word loaded in host byte order with the
 * signed fields B = bits 0-9, G = bits 10-19, R = bits 20-29 and
 * A = bits 30-31.  Each field is sign-extended and converted to float
 * unchanged (no normalisation), then written in R, G, B, A order.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_b10g10r10a2_sscaled_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint32_t *texel = (const uint32_t *)src_row;
      float *out = dst_row;
      for (col = 0; col < width; ++col, out += 4) {
         const uint32_t packed = texel[col];
         /* Move each field up against bit 31 and shift it back down as a
          * signed value; this relies on >> of a negative int32_t behaving as
          * an arithmetic shift. */
         const int32_t blue  = (int32_t)(packed << 22) >> 22;
         const int32_t green = (int32_t)(packed << 12) >> 22;
         const int32_t red   = (int32_t)(packed << 2) >> 22;
         const int32_t alpha = (int32_t)packed >> 30;
         out[0] = (float)red;
         out[1] = (float)green;
         out[2] = (float)blue;
         out[3] = (float)alpha;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA floats into B10G10R10A2_SSCALED.
 *
 * Colour channels are clamped to [-512, 511] and alpha to [-2, 1], truncated
 * to integers and stored as one 32-bit word per pixel in host byte order
 * with B in bits 0-9, G in bits 10-19, R in bits 20-29 and A in bits 30-31.
 *
 * The clamped float is converted to int32_t first and only then to uint32_t.
 * A direct float -> uint32_t conversion of a negative value is undefined
 * behaviour in C, so it is not guaranteed to yield the two's-complement bit
 * pattern that the masks and shifts below depend on; int32_t -> uint32_t is
 * a well-defined modulo conversion.
 *
 * NOTE(review): the file header says this file is generated by
 * u_format_table.py, so the same change presumably belongs in the generator.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_b10g10r10a2_sscaled_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const float *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         /* The field positions are the same on big- and little-endian hosts
          * because the word is stored with a native 32-bit write. */
         uint32_t value = 0;
         value |= ((uint32_t)(int32_t)CLAMP(src[2], -512.0f, 511.0f)) & 0x3ff; /* b */
         value |= (((uint32_t)(int32_t)CLAMP(src[1], -512.0f, 511.0f)) & 0x3ff) << 10; /* g */
         value |= (((uint32_t)(int32_t)CLAMP(src[0], -512.0f, 511.0f)) & 0x3ff) << 20; /* r */
         /* No mask needed for alpha: the shift discards everything above
          * its two bits. */
         value |= ((uint32_t)(int32_t)CLAMP(src[3], -2.0f, 1.0f)) << 30; /* a */
         *(uint32_t *)dst = value;
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B10G10R10A2_SSCALED pixel as four floats (R, G, B, A).
 *
 * src points at the 32-bit pixel word, which is loaded in host byte order;
 * the i and j arguments are not used.  Fields are sign-extended and
 * converted to float without normalisation.
 */
static inline void
util_format_b10g10r10a2_sscaled_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint32_t packed = *(const uint32_t *)src;
   /* Shift each field up against bit 31, then back down as a signed value
    * (relies on >> of a negative int32_t being an arithmetic shift). */
   const int32_t blue  = (int32_t)(packed << 22) >> 22;
   const int32_t green = (int32_t)(packed << 12) >> 22;
   const int32_t red   = (int32_t)(packed << 2) >> 22;
   const int32_t alpha = (int32_t)packed >> 30;

   dst[0] = (float)red;
   dst[1] = (float)green;
   dst[2] = (float)blue;
   dst[3] = (float)alpha;
}

/**
 * Unpack a width x height rectangle of B10G10R10A2_SSCALED pixels to RGBA8.
 *
 * Every source pixel is one 32-bit word loaded in host byte order with the
 * signed fields B = bits 0-9, G = bits 10-19, R = bits 20-29 and
 * A = bits 30-31.  Colour fields are clamped to [0, 1] and alpha to >= 0
 * (its maximum is already 1), then scaled by 0xff, so each output byte is
 * either 0 or 0xff.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_b10g10r10a2_sscaled_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint32_t *texel = (const uint32_t *)src_row;
      uint8_t *out = dst_row;
      for (col = 0; col < width; ++col, out += 4) {
         const uint32_t packed = texel[col];
         /* Sign-extend each field (relies on >> of a negative int32_t being
          * an arithmetic shift). */
         const int32_t blue  = (int32_t)(packed << 22) >> 22;
         const int32_t green = (int32_t)(packed << 12) >> 22;
         const int32_t red   = (int32_t)(packed << 2) >> 22;
         const int32_t alpha = (int32_t)packed >> 30;
         out[0] = (uint8_t)(((uint32_t)CLAMP(red, 0, 1)) * 0xff);
         out[1] = (uint8_t)(((uint32_t)CLAMP(green, 0, 1)) * 0xff);
         out[2] = (uint8_t)(((uint32_t)CLAMP(blue, 0, 1)) * 0xff);
         out[3] = (uint8_t)(((uint32_t)MAX2(alpha, 0)) * 0xff);
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA8 pixels into B10G10R10A2_SSCALED.
 *
 * Each 8-bit channel is integer-divided by 0xff, so only an input of 0xff
 * produces 1 and everything else produces 0.  The result is stored as one
 * 32-bit word per pixel in host byte order with B in bits 0-9, G in bits
 * 10-19, R in bits 20-29 and A in bits 30-31.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_b10g10r10a2_sscaled_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      uint32_t *out = (uint32_t *)dst_row;
      const uint8_t *in = src_row;
      for (col = 0; col < width; ++col, in += 4) {
         const uint32_t blue  = ((uint32_t)in[2] / 0xff) & 0x3ff;
         const uint32_t green = ((uint32_t)in[1] / 0xff) & 0x3ff;
         const uint32_t red   = ((uint32_t)in[0] / 0xff) & 0x3ff;
         const uint32_t alpha = (uint32_t)in[3] / 0xff;
         out[col] = blue | (green << 10) | (red << 20) | (alpha << 30);
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a width x height rectangle of B10G10R10A2_SNORM pixels to floats.
 *
 * Every source pixel is one 32-bit word loaded in host byte order with the
 * signed fields B = bits 0-9, G = bits 10-19, R = bits 20-29 and
 * A = bits 30-31.  Colour fields are sign-extended and scaled by 1/0x1ff;
 * alpha is scaled by 1.  No clamping is applied, so the most negative field
 * value maps slightly below -1 (and alpha can be -2).
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_b10g10r10a2_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint32_t *texel = (const uint32_t *)src_row;
      float *out = dst_row;
      for (col = 0; col < width; ++col, out += 4) {
         const uint32_t packed = texel[col];
         /* Sign-extend each field (relies on >> of a negative int32_t being
          * an arithmetic shift). */
         const int32_t blue  = (int32_t)(packed << 22) >> 22;
         const int32_t green = (int32_t)(packed << 12) >> 22;
         const int32_t red   = (int32_t)(packed << 2) >> 22;
         const int32_t alpha = (int32_t)packed >> 30;
         out[0] = (float)(red * (1.0f/0x1ff));
         out[1] = (float)(green * (1.0f/0x1ff));
         out[2] = (float)(blue * (1.0f/0x1ff));
         out[3] = (float)(alpha * (1.0f/0x1));
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA floats into B10G10R10A2_SNORM.
 *
 * Every channel is clamped to [-1, 1]; colour channels are scaled by 0x1ff
 * and alpha by 1, and the products are converted to integers with
 * util_iround().  The result is stored as one 32-bit word per pixel in host
 * byte order with B in bits 0-9, G in bits 10-19, R in bits 20-29 and A in
 * bits 30-31.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_b10g10r10a2_snorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, const float *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      uint32_t *out = (uint32_t *)dst_row;
      const float *in = src_row;
      for (col = 0; col < width; ++col, in += 4) {
         const uint32_t blue  = ((uint32_t)util_iround(CLAMP(in[2], -1.0f, 1.0f) * 0x1ff)) & 0x3ff;
         const uint32_t green = ((uint32_t)util_iround(CLAMP(in[1], -1.0f, 1.0f) * 0x1ff)) & 0x3ff;
         const uint32_t red   = ((uint32_t)util_iround(CLAMP(in[0], -1.0f, 1.0f) * 0x1ff)) & 0x3ff;
         /* No mask for alpha: the shift by 30 discards the upper bits. */
         const uint32_t alpha = (uint32_t)util_iround(CLAMP(in[3], -1.0f, 1.0f) * 0x1);
         out[col] = blue | (green << 10) | (red << 20) | (alpha << 30);
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B10G10R10A2_SNORM pixel as four floats (R, G, B, A).
 *
 * src points at the 32-bit pixel word, which is loaded in host byte order;
 * the i and j arguments are not used.  Colour fields are sign-extended and
 * scaled by 1/0x1ff, alpha by 1; no clamping is applied.
 */
static inline void
util_format_b10g10r10a2_snorm_fetch_rgba_float(float *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint32_t packed = *(const uint32_t *)src;
   /* Sign-extend each field (relies on >> of a negative int32_t being an
    * arithmetic shift). */
   const int32_t blue  = (int32_t)(packed << 22) >> 22;
   const int32_t green = (int32_t)(packed << 12) >> 22;
   const int32_t red   = (int32_t)(packed << 2) >> 22;
   const int32_t alpha = (int32_t)packed >> 30;

   dst[0] = (float)(red * (1.0f/0x1ff));
   dst[1] = (float)(green * (1.0f/0x1ff));
   dst[2] = (float)(blue * (1.0f/0x1ff));
   dst[3] = (float)(alpha * (1.0f/0x1));
}

/**
 * Unpack a width x height rectangle of B10G10R10A2_SNORM pixels to RGBA8.
 *
 * Every source pixel is one 32-bit word loaded in host byte order with the
 * signed fields B = bits 0-9, G = bits 10-19, R = bits 20-29 and
 * A = bits 30-31.  Negative channel values are clamped to zero before being
 * narrowed to 8 bits.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_b10g10r10a2_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint32_t *texel = (const uint32_t *)src_row;
      uint8_t *out = dst_row;
      for (col = 0; col < width; ++col, out += 4) {
         const uint32_t packed = texel[col];
         /* Sign-extend each field (relies on >> of a negative int32_t being
          * an arithmetic shift). */
         const int32_t blue  = (int32_t)(packed << 22) >> 22;
         const int32_t green = (int32_t)(packed << 12) >> 22;
         const int32_t red   = (int32_t)(packed << 2) >> 22;
         const int32_t alpha = (int32_t)packed >> 30;
         /* 9 magnitude bits -> 8 bits: drop the least significant bit. */
         out[0] = (uint8_t)(MAX2(red, 0) >> 1);
         out[1] = (uint8_t)(MAX2(green, 0) >> 1);
         out[2] = (uint8_t)(MAX2(blue, 0) >> 1);
         /* Alpha is at most 1 after clamping, so it maps to 0 or 0xff. */
         out[3] = (uint8_t)(((uint32_t)MAX2(alpha, 0)) * 0xff);
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of RGBA8 pixels into B10G10R10A2_SNORM.
 *
 * The colour channels are rescaled from 0..0xff to 0..0x1ff (the positive
 * half of the 10-bit signed range); alpha keeps only its top bit.  The
 * result is stored as one 32-bit word per pixel in host byte order with
 * B in bits 0-9, G in bits 10-19, R in bits 20-29 and A in bits 30-31.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_b10g10r10a2_snorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      uint32_t *out = (uint32_t *)dst_row;
      const uint8_t *in = src_row;
      for (col = 0; col < width; ++col, in += 4) {
         const uint32_t blue  = ((uint32_t)in[2] * 0x1ff / 0xff) & 0x3ff;
         const uint32_t green = ((uint32_t)in[1] * 0x1ff / 0xff) & 0x3ff;
         const uint32_t red   = ((uint32_t)in[0] * 0x1ff / 0xff) & 0x3ff;
         const uint32_t alpha = (uint32_t)(in[3] >> 7);
         out[col] = blue | (green << 10) | (red << 20) | (alpha << 30);
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a width x height rectangle of R8_UINT pixels to unsigned RGBA.
 *
 * Each source byte becomes the red channel; green and blue are set to 0 and
 * alpha to 1.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_r8_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      unsigned *out = dst_row;
      for (col = 0; col < width; ++col, out += 4) {
         out[0] = (unsigned)src_row[col]; /* r */
         out[1] = 0; /* g */
         out[2] = 0; /* b */
         out[3] = 1; /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of unsigned RGBA pixels into R8_UINT.
 *
 * Only the red channel is stored, limited to a maximum of 255; the other
 * three source channels are ignored.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_r8_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const unsigned *in = src_row;
      for (col = 0; col < width; ++col, in += 4) {
         const unsigned red = in[0];
         dst_row[col] = (uint8_t)MIN2(red, 255);
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8_UINT pixel as unsigned RGBA.
 *
 * The byte at src becomes red; green and blue are 0 and alpha is 1.  The i
 * and j arguments are not used.
 */
static inline void
util_format_r8_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint8_t red = src[0];

   dst[0] = (unsigned)red; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a width x height rectangle of R8_UINT pixels to signed-int RGBA.
 *
 * Each source byte becomes the red channel; green and blue are set to 0 and
 * alpha to 1.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_r8_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      int *out = dst_row;
      for (col = 0; col < width; ++col, out += 4) {
         out[0] = (int)src_row[col]; /* r */
         out[1] = 0; /* g */
         out[2] = 0; /* b */
         out[3] = 1; /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of signed-int RGBA pixels into R8_UINT.
 *
 * Only the red channel is stored, clamped to [0, 255]; the other three
 * source channels are ignored.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_r8_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const int *in = src_row;
      for (col = 0; col < width; ++col, in += 4) {
         const int red = in[0];
         dst_row[col] = (uint8_t)CLAMP(red, 0, 255);
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a width x height rectangle of R8G8_UINT pixels to unsigned RGBA.
 *
 * Each pixel is loaded as one 16-bit word in host byte order.  Which half of
 * that word is red and which is green depends on UTIL_ARCH_BIG_ENDIAN.
 * Blue is set to 0 and alpha to 1.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_r8g8_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint16_t *texel = (const uint16_t *)src_row;
      unsigned *out = dst_row;
      for (col = 0; col < width; ++col, out += 4) {
         const uint16_t packed = texel[col];
#if UTIL_ARCH_BIG_ENDIAN
         const uint16_t red   = packed >> 8;
         const uint16_t green = packed & 0xff;
#else
         const uint16_t red   = packed & 0xff;
         const uint16_t green = packed >> 8;
#endif
         out[0] = (unsigned)red;
         out[1] = (unsigned)green;
         out[2] = 0; /* b */
         out[3] = 1; /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of unsigned RGBA pixels into R8G8_UINT.
 *
 * Red and green are each limited to a maximum of 255 and combined into one
 * 16-bit word stored in host byte order; which half of the word each channel
 * occupies depends on UTIL_ARCH_BIG_ENDIAN.  Blue and alpha are ignored.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_r8g8_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      uint16_t *out = (uint16_t *)dst_row;
      const unsigned *in = src_row;
      for (col = 0; col < width; ++col, in += 4) {
         const uint8_t red   = (uint8_t)MIN2(in[0], 255);
         const uint8_t green = (uint8_t)MIN2(in[1], 255);
#if UTIL_ARCH_BIG_ENDIAN
         out[col] = (uint16_t)(((uint32_t)red << 8) | green);
#else
         out[col] = (uint16_t)(red | ((uint32_t)green << 8));
#endif
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8_UINT pixel as unsigned RGBA.
 *
 * The pixel is loaded as one 16-bit word in host byte order; which half is
 * red and which is green depends on UTIL_ARCH_BIG_ENDIAN.  Blue is 0 and
 * alpha is 1.  The i and j arguments are not used.
 */
static inline void
util_format_r8g8_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const uint16_t packed = *(const uint16_t *)src;
#if UTIL_ARCH_BIG_ENDIAN
   const uint16_t red   = packed >> 8;
   const uint16_t green = packed & 0xff;
#else
   const uint16_t red   = packed & 0xff;
   const uint16_t green = packed >> 8;
#endif

   dst[0] = (unsigned)red;
   dst[1] = (unsigned)green;
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a width x height rectangle of R8G8_UINT pixels to signed-int RGBA.
 *
 * Each pixel is loaded as one 16-bit word in host byte order.  Which half of
 * that word is red and which is green depends on UTIL_ARCH_BIG_ENDIAN.
 * Blue is set to 0 and alpha to 1.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_r8g8_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      const uint16_t *texel = (const uint16_t *)src_row;
      int *out = dst_row;
      for (col = 0; col < width; ++col, out += 4) {
         const uint16_t packed = texel[col];
#if UTIL_ARCH_BIG_ENDIAN
         const uint16_t red   = packed >> 8;
         const uint16_t green = packed & 0xff;
#else
         const uint16_t red   = packed & 0xff;
         const uint16_t green = packed >> 8;
#endif
         out[0] = (int)red;
         out[1] = (int)green;
         out[2] = 0; /* b */
         out[3] = 1; /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a width x height rectangle of signed-int RGBA pixels into R8G8_UINT.
 *
 * Red and green are each clamped to [0, 255] and combined into one 16-bit
 * word stored in host byte order; which half of the word each channel
 * occupies depends on UTIL_ARCH_BIG_ENDIAN.  Blue and alpha are ignored.
 *
 * dst_stride and src_stride are the byte distances between successive rows.
 */
static inline void
util_format_r8g8_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;
   for (row = 0; row < height; ++row) {
      uint16_t *out = (uint16_t *)dst_row;
      const int *in = src_row;
      for (col = 0; col < width; ++col, in += 4) {
         const uint8_t red   = (uint8_t)CLAMP(in[0], 0, 255);
         const uint8_t green = (uint8_t)CLAMP(in[1], 0, 255);
#if UTIL_ARCH_BIG_ENDIAN
         out[col] = (uint16_t)(((uint32_t)red << 8) | green);
#else
         out[col] = (uint16_t)(red | ((uint32_t)green << 8));
#endif
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * One R8G8B8_UINT pixel: three 8-bit channels declared in r, g, b order.
 * The member order is the same on big- and little-endian hosts.
 */
struct util_format_r8g8b8_uint {
   uint8_t r;
   uint8_t g;
   uint8_t b;
};

static inline void
util_format_r8g8b8_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)pixel.r; /* r */
         dst[1] = (unsigned)pixel.g; /* g */
         dst[2] = (unsigned)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r8g8b8_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)pixel.r; /* r */
         dst[1] = (unsigned)pixel.g; /* g */
         dst[2] = (unsigned)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r8g8b8_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_uint pixel;
         pixel.r = (uint8_t)MIN2(src[0], 255);
         pixel.g = (uint8_t)MIN2(src[1], 255);
         pixel.b = (uint8_t)MIN2(src[2], 255);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_uint pixel;
         pixel.r = (uint8_t)MIN2(src[0], 255);
         pixel.g = (uint8_t)MIN2(src[1], 255);
         pixel.b = (uint8_t)MIN2(src[2], 255);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8B8_UINT pixel as unsigned RGBA.
 *
 * The pixel at src is copied into a struct util_format_r8g8b8_uint and its
 * r, g and b members are widened to unsigned; alpha is 1.  The i and j
 * arguments are not used.
 */
static inline void
util_format_r8g8b8_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r8g8b8_uint texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (unsigned)texel.r;
   dst[1] = (unsigned)texel.g;
   dst[2] = (unsigned)texel.b;
   dst[3] = 1; /* a */
}

static inline void
util_format_r8g8b8_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r8g8b8_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r8g8b8_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_uint pixel;
         pixel.r = (uint8_t)CLAMP(src[0], 0, 255);
         pixel.g = (uint8_t)CLAMP(src[1], 0, 255);
         pixel.b = (uint8_t)CLAMP(src[2], 0, 255);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_uint pixel;
         pixel.r = (uint8_t)CLAMP(src[0], 0, 255);
         pixel.g = (uint8_t)CLAMP(src[1], 0, 255);
         pixel.b = (uint8_t)CLAMP(src[2], 0, 255);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack R8G8B8A8_UINT pixels into rows of unsigned RGBA values.
 *
 * Every 4-byte source pixel produces four unsigned outputs (r, g, b, a).
 * src_stride and dst_stride are byte strides between consecutive rows.
 */
static inline void
util_format_r8g8b8a8_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         /* src is a byte pointer with no alignment guarantee, so the pixel
          * word is loaded with memcpy instead of a uint32_t pointer cast. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         dst[0] = (unsigned)(value >> 24); /* r */
         dst[1] = (unsigned)((value >> 16) & 0xff); /* g */
         dst[2] = (unsigned)((value >> 8) & 0xff); /* b */
         dst[3] = (unsigned)((value) & 0xff); /* a */
#else
         dst[0] = (unsigned)((value) & 0xff); /* r */
         dst[1] = (unsigned)((value >> 8) & 0xff); /* g */
         dst[2] = (unsigned)((value >> 16) & 0xff); /* b */
         dst[3] = (unsigned)(value >> 24); /* a */
#endif
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of unsigned RGBA values into R8G8B8A8_UINT pixels.
 *
 * Each channel is passed through MIN2(..., 255), narrowed to uint8_t and
 * placed into a 32-bit word at a byte-order dependent shift.  dst_stride and
 * src_stride are byte strides between consecutive rows.
 */
static inline void
util_format_r8g8b8a8_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint8_t)MIN2(src[0], 255)) << 24;
         value |= (uint32_t)(((uint8_t)MIN2(src[1], 255)) & 0xff) << 16;
         value |= (uint32_t)(((uint8_t)MIN2(src[2], 255)) & 0xff) << 8;
         value |= ((uint8_t)MIN2(src[3], 255)) & 0xff;
#else
         value |= ((uint8_t)MIN2(src[0], 255)) & 0xff;
         value |= (uint32_t)(((uint8_t)MIN2(src[1], 255)) & 0xff) << 8;
         value |= (uint32_t)(((uint8_t)MIN2(src[2], 255)) & 0xff) << 16;
         value |= (uint32_t)((uint8_t)MIN2(src[3], 255)) << 24;
#endif
         /* dst is a byte pointer with no alignment guarantee, so the word
          * is stored with memcpy instead of a uint32_t pointer cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8B8A8_UINT pixel as four unsigned values (r, g, b, a).
 *
 * i and j are unused.
 */
static inline void
util_format_r8g8b8a8_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   /* src is a byte pointer with no alignment guarantee, so the pixel word
    * is loaded with memcpy instead of a uint32_t pointer cast. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   dst[0] = (unsigned)(value >> 24); /* r */
   dst[1] = (unsigned)((value >> 16) & 0xff); /* g */
   dst[2] = (unsigned)((value >> 8) & 0xff); /* b */
   dst[3] = (unsigned)((value) & 0xff); /* a */
#else
   dst[0] = (unsigned)((value) & 0xff); /* r */
   dst[1] = (unsigned)((value >> 8) & 0xff); /* g */
   dst[2] = (unsigned)((value >> 16) & 0xff); /* b */
   dst[3] = (unsigned)(value >> 24); /* a */
#endif
}

/**
 * Unpack R8G8B8A8_UINT pixels into rows of signed RGBA values.
 *
 * Every 4-byte source pixel produces four int outputs (r, g, b, a), each in
 * the range of an 8-bit unsigned channel.  src_stride and dst_stride are
 * byte strides between consecutive rows.
 */
static inline void
util_format_r8g8b8a8_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         /* src is a byte pointer with no alignment guarantee, so the pixel
          * word is loaded with memcpy instead of a uint32_t pointer cast. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         dst[0] = (int)(value >> 24); /* r */
         dst[1] = (int)((value >> 16) & 0xff); /* g */
         dst[2] = (int)((value >> 8) & 0xff); /* b */
         dst[3] = (int)((value) & 0xff); /* a */
#else
         dst[0] = (int)((value) & 0xff); /* r */
         dst[1] = (int)((value >> 8) & 0xff); /* g */
         dst[2] = (int)((value >> 16) & 0xff); /* b */
         dst[3] = (int)(value >> 24); /* a */
#endif
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of signed RGBA values into R8G8B8A8_UINT pixels.
 *
 * Each channel is passed through CLAMP(..., 0, 255), narrowed to uint8_t and
 * placed into a 32-bit word at a byte-order dependent shift.  dst_stride and
 * src_stride are byte strides between consecutive rows.
 */
static inline void
util_format_r8g8b8a8_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint8_t)CLAMP(src[0], 0, 255)) << 24;
         value |= (uint32_t)(((uint8_t)CLAMP(src[1], 0, 255)) & 0xff) << 16;
         value |= (uint32_t)(((uint8_t)CLAMP(src[2], 0, 255)) & 0xff) << 8;
         value |= ((uint8_t)CLAMP(src[3], 0, 255)) & 0xff;
#else
         value |= ((uint8_t)CLAMP(src[0], 0, 255)) & 0xff;
         value |= (uint32_t)(((uint8_t)CLAMP(src[1], 0, 255)) & 0xff) << 8;
         value |= (uint32_t)(((uint8_t)CLAMP(src[2], 0, 255)) & 0xff) << 16;
         value |= (uint32_t)((uint8_t)CLAMP(src[3], 0, 255)) << 24;
#endif
         /* dst is a byte pointer with no alignment guarantee, so the word
          * is stored with memcpy instead of a uint32_t pointer cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack R8_SINT pixels into rows of signed RGBA values.
 *
 * Each source byte is reinterpreted as int8_t and written as the red
 * channel; green and blue are set to 0 and alpha to 1.  src_stride and
 * dst_stride are byte strides between consecutive rows.
 */
static inline void
util_format_r8_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      int *out = dst_row;
      unsigned col;
      for (col = 0; col < width; ++col, in += 1, out += 4) {
         const int8_t red = (int8_t)(*in);
         out[0] = (int)red; /* r */
         out[1] = 0; /* g */
         out[2] = 0; /* b */
         out[3] = 1; /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of signed RGBA values into R8_SINT pixels.
 *
 * Only the first of every four ints is read; it is passed through
 * CLAMP(..., -128, 127), narrowed to int8_t and stored as one byte.
 * dst_stride and src_stride are byte strides between consecutive rows.
 */
static inline void
util_format_r8_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;
   for (row = 0; row < height; ++row) {
      const int *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;
      for (col = 0; col < width; ++col, in += 4, out += 1) {
         *out = (uint8_t)((int8_t)CLAMP(in[0], -128, 127));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8_SINT pixel as four signed values.
 *
 * The byte at src is reinterpreted as int8_t for red; green and blue are
 * set to 0 and alpha to 1.  i and j are unused.
 */
static inline void
util_format_r8_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const int8_t red = (int8_t)(*src);

   dst[0] = (int)red; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack R8_SINT pixels into rows of unsigned RGBA values.
 *
 * Each source byte is reinterpreted as int8_t and passed through
 * MAX2(..., 0) before being written as the red channel; green and blue are
 * set to 0 and alpha to 1.  src_stride and dst_stride are byte strides
 * between consecutive rows.
 */
static inline void
util_format_r8_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;
   for (row = 0; row < height; ++row) {
      const uint8_t *in = src_row;
      unsigned *out = dst_row;
      unsigned col;
      for (col = 0; col < width; ++col, in += 1, out += 4) {
         const int8_t red = (int8_t)(*in);
         out[0] = (unsigned)MAX2(red, 0); /* r */
         out[1] = 0; /* g */
         out[2] = 0; /* b */
         out[3] = 1; /* a */
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of unsigned RGBA values into R8_SINT pixels.
 *
 * Only the first of every four unsigned values is read; it is passed through
 * MIN2(..., 127), narrowed to int8_t and stored as one byte.  dst_stride and
 * src_stride are byte strides between consecutive rows.
 */
static inline void
util_format_r8_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row;
   for (row = 0; row < height; ++row) {
      const unsigned *in = src_row;
      uint8_t *out = dst_row;
      unsigned col;
      for (col = 0; col < width; ++col, in += 4, out += 1) {
         *out = (uint8_t)((int8_t)MIN2(in[0], 127));
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack R8G8_SINT pixels into rows of signed RGBA values.
 *
 * Every 2-byte source pixel yields signed red and green; blue is set to 0
 * and alpha to 1.  src_stride and dst_stride are byte strides between
 * consecutive rows.
 */
static inline void
util_format_r8g8_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value;
         int16_t r;
         int16_t g;
         /* src is a byte pointer with no alignment guarantee, so the pixel
          * word is loaded with memcpy instead of a uint16_t pointer cast. */
         memcpy(&value, src, sizeof value);
         /* Shifting a field to the top of an int16_t and back down by 8
          * extracts the 8-bit channel together with its sign. */
#if UTIL_ARCH_BIG_ENDIAN
         r = ((int16_t)(value) ) >> 8;
         g = ((int16_t)(value << 8) ) >> 8;
#else
         r = ((int16_t)(value << 8) ) >> 8;
         g = ((int16_t)(value) ) >> 8;
#endif
         dst[0] = (int)r; /* r */
         dst[1] = (int)g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of signed RGBA values into R8G8_SINT pixels.
 *
 * The first two of every four ints are passed through
 * CLAMP(..., -128, 127), narrowed to int8_t and combined into a 16-bit word
 * at byte-order dependent positions.  dst_stride and src_stride are byte
 * strides between consecutive rows.
 */
static inline void
util_format_r8g8_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint16_t)((uint32_t)((int8_t)CLAMP(src[0], -128, 127)) << 8) ;
         value |= (uint16_t)(((int8_t)CLAMP(src[1], -128, 127)) & 0xff) ;
#else
         value |= (uint16_t)(((int8_t)CLAMP(src[0], -128, 127)) & 0xff) ;
         value |= (uint16_t)((uint32_t)((int8_t)CLAMP(src[1], -128, 127)) << 8) ;
#endif
         /* dst is a byte pointer with no alignment guarantee, so the word
          * is stored with memcpy instead of a uint16_t pointer cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8_SINT pixel as four signed values.
 *
 * Red and green come from the 2-byte pixel at src; blue is set to 0 and
 * alpha to 1.  i and j are unused.
 */
static inline void
util_format_r8g8_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint16_t value;
   int16_t r;
   int16_t g;
   /* src is a byte pointer with no alignment guarantee, so the pixel word
    * is loaded with memcpy instead of a uint16_t pointer cast. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   r = ((int16_t)(value) ) >> 8;
   g = ((int16_t)(value << 8) ) >> 8;
#else
   r = ((int16_t)(value << 8) ) >> 8;
   g = ((int16_t)(value) ) >> 8;
#endif
   dst[0] = (int)r; /* r */
   dst[1] = (int)g; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack R8G8_SINT pixels into rows of unsigned RGBA values.
 *
 * Signed red and green are extracted from every 2-byte pixel and passed
 * through MAX2(..., 0) before being stored; blue is set to 0 and alpha
 * to 1.  src_stride and dst_stride are byte strides between consecutive
 * rows.
 */
static inline void
util_format_r8g8_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value;
         int16_t r;
         int16_t g;
         /* src is a byte pointer with no alignment guarantee, so the pixel
          * word is loaded with memcpy instead of a uint16_t pointer cast. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = ((int16_t)(value) ) >> 8;
         g = ((int16_t)(value << 8) ) >> 8;
#else
         r = ((int16_t)(value << 8) ) >> 8;
         g = ((int16_t)(value) ) >> 8;
#endif
         dst[0] = (unsigned)MAX2(r, 0); /* r */
         dst[1] = (unsigned)MAX2(g, 0); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of unsigned RGBA values into R8G8_SINT pixels.
 *
 * The first two of every four unsigned values are passed through
 * MIN2(..., 127), narrowed to int8_t and combined into a 16-bit word at
 * byte-order dependent positions.  dst_stride and src_stride are byte
 * strides between consecutive rows.
 */
static inline void
util_format_r8g8_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint16_t)((uint32_t)((int8_t)MIN2(src[0], 127)) << 8) ;
         value |= (uint16_t)(((int8_t)MIN2(src[1], 127)) & 0xff) ;
#else
         value |= (uint16_t)(((int8_t)MIN2(src[0], 127)) & 0xff) ;
         value |= (uint16_t)((uint32_t)((int8_t)MIN2(src[1], 127)) << 8) ;
#endif
         /* dst is a byte pointer with no alignment guarantee, so the word
          * is stored with memcpy instead of a uint16_t pointer cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/* One R8G8B8_SINT pixel: three signed 8-bit channels declared in r, g, b
 * order.  The member order is the same for both byte orders. */
struct util_format_r8g8b8_sint {
   int8_t r;
   int8_t g;
   int8_t b;
};

static inline void
util_format_r8g8b8_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r8g8b8_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r8g8b8_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_sint pixel;
         pixel.r = (int8_t)CLAMP(src[0], -128, 127);
         pixel.g = (int8_t)CLAMP(src[1], -128, 127);
         pixel.b = (int8_t)CLAMP(src[2], -128, 127);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_sint pixel;
         pixel.r = (int8_t)CLAMP(src[0], -128, 127);
         pixel.g = (int8_t)CLAMP(src[1], -128, 127);
         pixel.b = (int8_t)CLAMP(src[2], -128, 127);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8B8_SINT pixel as four signed values.
 *
 * The pixel at src is copied into the pixel struct; its r, g and b fields
 * go to dst[0..2] and dst[3] (alpha) is set to 1.  i and j are unused.
 */
static inline void
util_format_r8g8b8_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Same code path for big- and little-endian builds. */
   struct util_format_r8g8b8_sint texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (int)texel.r; /* r */
   dst[1] = (int)texel.g; /* g */
   dst[2] = (int)texel.b; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r8g8b8_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MAX2(pixel.r, 0); /* r */
         dst[1] = (unsigned)MAX2(pixel.g, 0); /* g */
         dst[2] = (unsigned)MAX2(pixel.b, 0); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r8g8b8_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MAX2(pixel.r, 0); /* r */
         dst[1] = (unsigned)MAX2(pixel.g, 0); /* g */
         dst[2] = (unsigned)MAX2(pixel.b, 0); /* b */
         dst[3] = 1; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r8g8b8_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r8g8b8_sint pixel;
         pixel.r = (int8_t)MIN2(src[0], 127);
         pixel.g = (int8_t)MIN2(src[1], 127);
         pixel.b = (int8_t)MIN2(src[2], 127);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r8g8b8_sint pixel;
         pixel.r = (int8_t)MIN2(src[0], 127);
         pixel.g = (int8_t)MIN2(src[1], 127);
         pixel.b = (int8_t)MIN2(src[2], 127);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack R8G8B8A8_SINT pixels into rows of signed RGBA values.
 *
 * Every 4-byte source pixel produces four int outputs (r, g, b, a).
 * src_stride and dst_stride are byte strides between consecutive rows.
 */
static inline void
util_format_r8g8b8a8_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         /* src is a byte pointer with no alignment guarantee, so the pixel
          * word is loaded with memcpy instead of a uint32_t pointer cast. */
         memcpy(&value, src, sizeof value);
         /* Shifting a field to the top of an int32_t and back down by 24
          * extracts the 8-bit channel together with its sign. */
#if UTIL_ARCH_BIG_ENDIAN
         dst[0] = (int)(((int32_t)(value) ) >> 24); /* r */
         dst[1] = (int)(((int32_t)(value << 8) ) >> 24); /* g */
         dst[2] = (int)(((int32_t)(value << 16) ) >> 24); /* b */
         dst[3] = (int)(((int32_t)(value << 24) ) >> 24); /* a */
#else
         dst[0] = (int)(((int32_t)(value << 24) ) >> 24); /* r */
         dst[1] = (int)(((int32_t)(value << 16) ) >> 24); /* g */
         dst[2] = (int)(((int32_t)(value << 8) ) >> 24); /* b */
         dst[3] = (int)(((int32_t)(value) ) >> 24); /* a */
#endif
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of signed RGBA values into R8G8B8A8_SINT pixels.
 *
 * Each channel is passed through CLAMP(..., -128, 127), narrowed to int8_t
 * and placed into a 32-bit word at a byte-order dependent shift.  dst_stride
 * and src_stride are byte strides between consecutive rows.
 */
static inline void
util_format_r8g8b8a8_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint32_t)((int8_t)CLAMP(src[0], -128, 127)) << 24) ;
         value |= (uint32_t)((uint32_t)(((int8_t)CLAMP(src[1], -128, 127)) & 0xff) << 16) ;
         value |= (uint32_t)((uint32_t)(((int8_t)CLAMP(src[2], -128, 127)) & 0xff) << 8) ;
         value |= (uint32_t)(((int8_t)CLAMP(src[3], -128, 127)) & 0xff) ;
#else
         value |= (uint32_t)(((int8_t)CLAMP(src[0], -128, 127)) & 0xff) ;
         value |= (uint32_t)((uint32_t)(((int8_t)CLAMP(src[1], -128, 127)) & 0xff) << 8) ;
         value |= (uint32_t)((uint32_t)(((int8_t)CLAMP(src[2], -128, 127)) & 0xff) << 16) ;
         value |= (uint32_t)((uint32_t)((int8_t)CLAMP(src[3], -128, 127)) << 24) ;
#endif
         /* dst is a byte pointer with no alignment guarantee, so the word
          * is stored with memcpy instead of a uint32_t pointer cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R8G8B8A8_SINT pixel as four signed values (r, g, b, a).
 *
 * i and j are unused.
 */
static inline void
util_format_r8g8b8a8_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   /* src is a byte pointer with no alignment guarantee, so the pixel word
    * is loaded with memcpy instead of a uint32_t pointer cast. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   dst[0] = (int)(((int32_t)(value) ) >> 24); /* r */
   dst[1] = (int)(((int32_t)(value << 8) ) >> 24); /* g */
   dst[2] = (int)(((int32_t)(value << 16) ) >> 24); /* b */
   dst[3] = (int)(((int32_t)(value << 24) ) >> 24); /* a */
#else
   dst[0] = (int)(((int32_t)(value << 24) ) >> 24); /* r */
   dst[1] = (int)(((int32_t)(value << 16) ) >> 24); /* g */
   dst[2] = (int)(((int32_t)(value << 8) ) >> 24); /* b */
   dst[3] = (int)(((int32_t)(value) ) >> 24); /* a */
#endif
}

/**
 * Unpack R8G8B8A8_SINT pixels into rows of unsigned RGBA values.
 *
 * The four signed channels of every 4-byte pixel are extracted and each is
 * passed through MAX2(..., 0) before being stored.  src_stride and
 * dst_stride are byte strides between consecutive rows.
 */
static inline void
util_format_r8g8b8a8_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         int32_t r;
         int32_t g;
         int32_t b;
         int32_t a;
         /* src is a byte pointer with no alignment guarantee, so the pixel
          * word is loaded with memcpy instead of a uint32_t pointer cast. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = ((int32_t)(value) ) >> 24;
         g = ((int32_t)(value << 8) ) >> 24;
         b = ((int32_t)(value << 16) ) >> 24;
         a = ((int32_t)(value << 24) ) >> 24;
#else
         r = ((int32_t)(value << 24) ) >> 24;
         g = ((int32_t)(value << 16) ) >> 24;
         b = ((int32_t)(value << 8) ) >> 24;
         a = ((int32_t)(value) ) >> 24;
#endif
         dst[0] = (unsigned)MAX2(r, 0); /* r */
         dst[1] = (unsigned)MAX2(g, 0); /* g */
         dst[2] = (unsigned)MAX2(b, 0); /* b */
         dst[3] = (unsigned)MAX2(a, 0); /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of unsigned RGBA values into R8G8B8A8_SINT pixels.
 *
 * Each channel is passed through MIN2(..., 127), narrowed to int8_t and
 * placed into a 32-bit word at a byte-order dependent shift.  dst_stride and
 * src_stride are byte strides between consecutive rows.
 */
static inline void
util_format_r8g8b8a8_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint32_t)((int8_t)MIN2(src[0], 127)) << 24) ;
         value |= (uint32_t)((uint32_t)(((int8_t)MIN2(src[1], 127)) & 0xff) << 16) ;
         value |= (uint32_t)((uint32_t)(((int8_t)MIN2(src[2], 127)) & 0xff) << 8) ;
         value |= (uint32_t)(((int8_t)MIN2(src[3], 127)) & 0xff) ;
#else
         value |= (uint32_t)(((int8_t)MIN2(src[0], 127)) & 0xff) ;
         value |= (uint32_t)((uint32_t)(((int8_t)MIN2(src[1], 127)) & 0xff) << 8) ;
         value |= (uint32_t)((uint32_t)(((int8_t)MIN2(src[2], 127)) & 0xff) << 16) ;
         value |= (uint32_t)((uint32_t)((int8_t)MIN2(src[3], 127)) << 24) ;
#endif
         /* dst is a byte pointer with no alignment guarantee, so the word
          * is stored with memcpy instead of a uint32_t pointer cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack R16_UINT pixels into rows of unsigned RGBA values.
 *
 * Every 2-byte source pixel is read as a native uint16_t and written as the
 * red channel; green and blue are set to 0 and alpha to 1.  src_stride and
 * dst_stride are byte strides between consecutive rows.
 */
static inline void
util_format_r16_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value;
         /* src is a byte pointer with no alignment guarantee, so the pixel
          * is loaded with memcpy instead of a uint16_t pointer cast. */
         memcpy(&value, src, sizeof value);
         dst[0] = (unsigned)value; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of unsigned RGBA values into R16_UINT pixels.
 *
 * Only the first of every four unsigned values is read; it is passed through
 * MIN2(..., 65535), narrowed to uint16_t and stored as a native 2-byte
 * value.  dst_stride and src_stride are byte strides between consecutive
 * rows.
 */
static inline void
util_format_r16_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value = 0;
         value |= (uint16_t)MIN2(src[0], 65535);
         /* dst is a byte pointer with no alignment guarantee, so the value
          * is stored with memcpy instead of a uint16_t pointer cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16_UINT pixel as four unsigned values.
 *
 * The 2-byte pixel at src is read as a native uint16_t for red; green and
 * blue are set to 0 and alpha to 1.  i and j are unused.
 */
static inline void
util_format_r16_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint16_t value;
   /* src is a byte pointer with no alignment guarantee, so the pixel is
    * loaded with memcpy instead of a uint16_t pointer cast. */
   memcpy(&value, src, sizeof value);
   dst[0] = (unsigned)value; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack R16_UINT pixels into rows of signed RGBA values.
 *
 * Every 2-byte source pixel is read as a native uint16_t and written as an
 * int red channel; green and blue are set to 0 and alpha to 1.  src_stride
 * and dst_stride are byte strides between consecutive rows.
 */
static inline void
util_format_r16_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value;
         /* src is a byte pointer with no alignment guarantee, so the pixel
          * is loaded with memcpy instead of a uint16_t pointer cast. */
         memcpy(&value, src, sizeof value);
         dst[0] = (int)value; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of signed RGBA values into R16_UINT pixels.
 *
 * Only the first of every four ints is read; it is passed through
 * CLAMP(..., 0, 65535), narrowed to uint16_t and stored as a native 2-byte
 * value.  dst_stride and src_stride are byte strides between consecutive
 * rows.
 */
static inline void
util_format_r16_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value = 0;
         value |= (uint16_t)CLAMP(src[0], 0, 65535);
         /* dst is a byte pointer with no alignment guarantee, so the value
          * is stored with memcpy instead of a uint16_t pointer cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack R16G16_UINT pixels into rows of unsigned RGBA values.
 *
 * Every 4-byte source pixel yields 16-bit red and green; blue is set to 0
 * and alpha to 1.  src_stride and dst_stride are byte strides between
 * consecutive rows.
 */
static inline void
util_format_r16g16_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         /* src is a byte pointer with no alignment guarantee, so the pixel
          * word is loaded with memcpy instead of a uint32_t pointer cast. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         dst[0] = (unsigned)(value >> 16); /* r */
         dst[1] = (unsigned)((value) & 0xffff); /* g */
#else
         dst[0] = (unsigned)((value) & 0xffff); /* r */
         dst[1] = (unsigned)(value >> 16); /* g */
#endif
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of unsigned RGBA values into R16G16_UINT pixels.
 *
 * The first two of every four unsigned values are passed through
 * MIN2(..., 65535), narrowed to uint16_t and combined into a 32-bit word at
 * byte-order dependent positions.  dst_stride and src_stride are byte
 * strides between consecutive rows.
 */
static inline void
util_format_r16g16_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint16_t)MIN2(src[0], 65535)) << 16;
         value |= ((uint16_t)MIN2(src[1], 65535)) & 0xffff;
#else
         value |= ((uint16_t)MIN2(src[0], 65535)) & 0xffff;
         value |= (uint32_t)((uint16_t)MIN2(src[1], 65535)) << 16;
#endif
         /* dst is a byte pointer with no alignment guarantee, so the word
          * is stored with memcpy instead of a uint32_t pointer cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16_UINT pixel from src into dst[0..3].
 *
 * The pixel is one native-endian 32-bit word with two 16-bit channels;
 * b is written as 0 and a as 1.  The coordinates i and j are not used.
 */
static inline void
util_format_r16g16_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   uint32_t r;
   uint32_t g;
   /* src is a byte pointer and may not be 4-byte aligned; memcpy performs
    * the load without a misaligned or type-punned access. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   r = value >> 16;
   g = (value) & 0xffff;
#else
   r = (value) & 0xffff;
   g = value >> 16;
#endif
   dst[0] = (unsigned)r; /* r */
   dst[1] = (unsigned)g; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack rows of R16G16_UINT pixels into four signed ints per pixel.
 *
 * Each source pixel is one 32-bit word in native byte order that holds two
 * 16-bit unsigned channels; both always fit in an int, so no clamping is
 * applied.  b is written as 0 and a as 1.  src_stride and dst_stride are
 * byte distances between consecutive rows.
 */
static inline void
util_format_r16g16_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t r;
         uint32_t g;
         /* src is a byte pointer and may not be 4-byte aligned; memcpy
          * performs the load without a misaligned or type-punned access. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = value >> 16;
         g = (value) & 0xffff;
#else
         r = (value) & 0xffff;
         g = value >> 16;
#endif
         dst[0] = (int)r; /* r */
         dst[1] = (int)g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of signed RGBA values into R16G16_UINT pixels.
 *
 * Only src[0] (r) and src[1] (g) of every four-component source pixel are
 * used; each is clamped to [0, 65535] and stored as a 16-bit channel of one
 * native-endian 32-bit word.  dst_stride and src_stride are byte distances
 * between consecutive rows.
 */
static inline void
util_format_r16g16_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint16_t)CLAMP(src[0], 0, 65535)) << 16;
         value |= ((uint16_t)CLAMP(src[1], 0, 65535)) & 0xffff;
#else
         value |= ((uint16_t)CLAMP(src[0], 0, 65535)) & 0xffff;
         value |= (uint32_t)((uint16_t)CLAMP(src[1], 0, 65535)) << 16;
#endif
         /* dst is a byte pointer and may not be 4-byte aligned; memcpy
          * performs the store without a misaligned or type-punned access. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one R16G16B16_UINT pixel: three consecutive 16-bit
 * unsigned channels.  The channel order is the same on big- and
 * little-endian builds, so a single definition covers both.
 */
struct util_format_r16g16b16_uint {
   uint16_t r; /* first channel */
   uint16_t g; /* second channel */
   uint16_t b; /* third channel */
};

static inline void
util_format_r16g16b16_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)pixel.r; /* r */
         dst[1] = (unsigned)pixel.g; /* g */
         dst[2] = (unsigned)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r16g16b16_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)pixel.r; /* r */
         dst[1] = (unsigned)pixel.g; /* g */
         dst[2] = (unsigned)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 6;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_uint pixel;
         pixel.r = (uint16_t)MIN2(src[0], 65535);
         pixel.g = (uint16_t)MIN2(src[1], 65535);
         pixel.b = (uint16_t)MIN2(src[2], 65535);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_uint pixel;
         pixel.r = (uint16_t)MIN2(src[0], 65535);
         pixel.g = (uint16_t)MIN2(src[1], 65535);
         pixel.b = (uint16_t)MIN2(src[2], 65535);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16_UINT pixel from src into dst[0..3].
 *
 * The three 16-bit channels are read in native byte order and widened to
 * unsigned; a is written as 1.  The coordinates i and j are not used.
 */
static inline void
util_format_r16g16b16_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r16g16b16_uint texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (unsigned)texel.r; /* r */
   dst[1] = (unsigned)texel.g; /* g */
   dst[2] = (unsigned)texel.b; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r16g16b16_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r16g16b16_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 6;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_uint pixel;
         pixel.r = (uint16_t)CLAMP(src[0], 0, 65535);
         pixel.g = (uint16_t)CLAMP(src[1], 0, 65535);
         pixel.b = (uint16_t)CLAMP(src[2], 0, 65535);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_uint pixel;
         pixel.r = (uint16_t)CLAMP(src[0], 0, 65535);
         pixel.g = (uint16_t)CLAMP(src[1], 0, 65535);
         pixel.b = (uint16_t)CLAMP(src[2], 0, 65535);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one R16G16B16A16_UINT pixel: four consecutive 16-bit
 * unsigned channels.  The channel order is the same on big- and
 * little-endian builds, so a single definition covers both.
 */
struct util_format_r16g16b16a16_uint {
   uint16_t r; /* first channel */
   uint16_t g; /* second channel */
   uint16_t b; /* third channel */
   uint16_t a; /* fourth channel */
};

static inline void
util_format_r16g16b16a16_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)pixel.r; /* r */
         dst[1] = (unsigned)pixel.g; /* g */
         dst[2] = (unsigned)pixel.b; /* b */
         dst[3] = (unsigned)pixel.a; /* a */
#else
         struct util_format_r16g16b16a16_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)pixel.r; /* r */
         dst[1] = (unsigned)pixel.g; /* g */
         dst[2] = (unsigned)pixel.b; /* b */
         dst[3] = (unsigned)pixel.a; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16a16_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_uint pixel;
         pixel.r = (uint16_t)MIN2(src[0], 65535);
         pixel.g = (uint16_t)MIN2(src[1], 65535);
         pixel.b = (uint16_t)MIN2(src[2], 65535);
         pixel.a = (uint16_t)MIN2(src[3], 65535);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_uint pixel;
         pixel.r = (uint16_t)MIN2(src[0], 65535);
         pixel.g = (uint16_t)MIN2(src[1], 65535);
         pixel.b = (uint16_t)MIN2(src[2], 65535);
         pixel.a = (uint16_t)MIN2(src[3], 65535);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16A16_UINT pixel from src into dst[0..3].
 *
 * The four 16-bit channels are read in native byte order and widened to
 * unsigned.  The coordinates i and j are not used.
 */
static inline void
util_format_r16g16b16a16_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r16g16b16a16_uint texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (unsigned)texel.r; /* r */
   dst[1] = (unsigned)texel.g; /* g */
   dst[2] = (unsigned)texel.b; /* b */
   dst[3] = (unsigned)texel.a; /* a */
}

static inline void
util_format_r16g16b16a16_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = (int)pixel.a; /* a */
#else
         struct util_format_r16g16b16a16_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = (int)pixel.a; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16a16_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_uint pixel;
         pixel.r = (uint16_t)CLAMP(src[0], 0, 65535);
         pixel.g = (uint16_t)CLAMP(src[1], 0, 65535);
         pixel.b = (uint16_t)CLAMP(src[2], 0, 65535);
         pixel.a = (uint16_t)CLAMP(src[3], 0, 65535);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_uint pixel;
         pixel.r = (uint16_t)CLAMP(src[0], 0, 65535);
         pixel.g = (uint16_t)CLAMP(src[1], 0, 65535);
         pixel.b = (uint16_t)CLAMP(src[2], 0, 65535);
         pixel.a = (uint16_t)CLAMP(src[3], 0, 65535);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack rows of R16_SINT pixels into four signed ints per pixel.
 *
 * Each source pixel is one native-endian 16-bit word reinterpreted as a
 * signed channel and sign-extended to int; g and b are written as 0 and a
 * as 1.  src_stride and dst_stride are byte distances between consecutive
 * rows.
 */
static inline void
util_format_r16_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value;
         int16_t r;
         /* src is a byte pointer and may not be 2-byte aligned; memcpy
          * performs the load without a misaligned or type-punned access. */
         memcpy(&value, src, sizeof value);
         r = (int16_t)(value) ;
         dst[0] = (int)r; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of signed RGBA values into R16_SINT pixels.
 *
 * Only src[0] (r) of every four-component source pixel is used; it is
 * clamped to [-32768, 32767] and stored as one native-endian 16-bit word.
 * dst_stride and src_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r16_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value = 0;
         value |= (uint16_t)((int16_t)CLAMP(src[0], -32768, 32767)) ;
         /* dst is a byte pointer and may not be 2-byte aligned; memcpy
          * performs the store without a misaligned or type-punned access. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16_SINT pixel from src into dst[0..3].
 *
 * The native-endian 16-bit word is reinterpreted as a signed channel and
 * sign-extended to int; g and b are written as 0 and a as 1.  The
 * coordinates i and j are not used.
 */
static inline void
util_format_r16_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint16_t value;
   int16_t r;
   /* src is a byte pointer and may not be 2-byte aligned; memcpy performs
    * the load without a misaligned or type-punned access. */
   memcpy(&value, src, sizeof value);
   r = (int16_t)(value) ;
   dst[0] = (int)r; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack rows of R16_SINT pixels into four unsigned ints per pixel.
 *
 * Each source pixel is one native-endian 16-bit word reinterpreted as a
 * signed channel; negative values are clamped to 0 before the conversion
 * to unsigned.  g and b are written as 0 and a as 1.  src_stride and
 * dst_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r16_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value;
         int16_t r;
         /* src is a byte pointer and may not be 2-byte aligned; memcpy
          * performs the load without a misaligned or type-punned access. */
         memcpy(&value, src, sizeof value);
         r = (int16_t)(value) ;
         dst[0] = (unsigned)MAX2(r, 0); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of unsigned RGBA values into R16_SINT pixels.
 *
 * Only src[0] (r) of every four-component source pixel is used; it is
 * clamped to at most 32767 and stored as one native-endian 16-bit word.
 * dst_stride and src_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r16_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value = 0;
         value |= (uint16_t)((int16_t)MIN2(src[0], 32767)) ;
         /* dst is a byte pointer and may not be 2-byte aligned; memcpy
          * performs the store without a misaligned or type-punned access. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack rows of R16G16_SINT pixels into four signed ints per pixel.
 *
 * Each source pixel is one 32-bit word in native byte order that holds two
 * 16-bit signed channels (r in the low half on little-endian builds, in
 * the high half on big-endian builds).  Each channel is sign-extended by
 * moving it to the top of the word and shifting right as a signed value.
 * b is written as 0 and a as 1.  src_stride and dst_stride are byte
 * distances between consecutive rows.
 */
static inline void
util_format_r16g16_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         int32_t r;
         int32_t g;
         /* src is a byte pointer and may not be 4-byte aligned; memcpy
          * performs the load without a misaligned or type-punned access. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = ((int32_t)(value) ) >> 16;
         g = ((int32_t)(value << 16) ) >> 16;
#else
         r = ((int32_t)(value << 16) ) >> 16;
         g = ((int32_t)(value) ) >> 16;
#endif
         dst[0] = (int)r; /* r */
         dst[1] = (int)g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of signed RGBA values into R16G16_SINT pixels.
 *
 * Only src[0] (r) and src[1] (g) of every four-component source pixel are
 * used; each is clamped to [-32768, 32767] and stored as a 16-bit channel
 * of one native-endian 32-bit word.  dst_stride and src_stride are byte
 * distances between consecutive rows.
 */
static inline void
util_format_r16g16_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint32_t)((int16_t)CLAMP(src[0], -32768, 32767)) << 16) ;
         value |= (uint32_t)(((int16_t)CLAMP(src[1], -32768, 32767)) & 0xffff) ;
#else
         value |= (uint32_t)(((int16_t)CLAMP(src[0], -32768, 32767)) & 0xffff) ;
         value |= (uint32_t)((uint32_t)((int16_t)CLAMP(src[1], -32768, 32767)) << 16) ;
#endif
         /* dst is a byte pointer and may not be 4-byte aligned; memcpy
          * performs the store without a misaligned or type-punned access. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16_SINT pixel from src into dst[0..3].
 *
 * The pixel is one native-endian 32-bit word with two 16-bit signed
 * channels, each sign-extended to int; b is written as 0 and a as 1.  The
 * coordinates i and j are not used.
 */
static inline void
util_format_r16g16_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   int32_t r;
   int32_t g;
   /* src is a byte pointer and may not be 4-byte aligned; memcpy performs
    * the load without a misaligned or type-punned access. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   r = ((int32_t)(value) ) >> 16;
   g = ((int32_t)(value << 16) ) >> 16;
#else
   r = ((int32_t)(value << 16) ) >> 16;
   g = ((int32_t)(value) ) >> 16;
#endif
   dst[0] = (int)r; /* r */
   dst[1] = (int)g; /* g */
   dst[2] = 0; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack rows of R16G16_SINT pixels into four unsigned ints per pixel.
 *
 * Each source pixel is one native-endian 32-bit word with two 16-bit
 * signed channels; each is sign-extended and negative values are clamped
 * to 0 before the conversion to unsigned.  b is written as 0 and a as 1.
 * src_stride and dst_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r16g16_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         int32_t r;
         int32_t g;
         /* src is a byte pointer and may not be 4-byte aligned; memcpy
          * performs the load without a misaligned or type-punned access. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         r = ((int32_t)(value) ) >> 16;
         g = ((int32_t)(value << 16) ) >> 16;
#else
         r = ((int32_t)(value << 16) ) >> 16;
         g = ((int32_t)(value) ) >> 16;
#endif
         dst[0] = (unsigned)MAX2(r, 0); /* r */
         dst[1] = (unsigned)MAX2(g, 0); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of unsigned RGBA values into R16G16_SINT pixels.
 *
 * Only src[0] (r) and src[1] (g) of every four-component source pixel are
 * used; each is clamped to at most 32767 and stored as a 16-bit channel of
 * one native-endian 32-bit word.  dst_stride and src_stride are byte
 * distances between consecutive rows.
 */
static inline void
util_format_r16g16_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint32_t)((int16_t)MIN2(src[0], 32767)) << 16) ;
         value |= (uint32_t)(((int16_t)MIN2(src[1], 32767)) & 0xffff) ;
#else
         value |= (uint32_t)(((int16_t)MIN2(src[0], 32767)) & 0xffff) ;
         value |= (uint32_t)((uint32_t)((int16_t)MIN2(src[1], 32767)) << 16) ;
#endif
         /* dst is a byte pointer and may not be 4-byte aligned; memcpy
          * performs the store without a misaligned or type-punned access. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one R16G16B16_SINT pixel: three consecutive 16-bit
 * signed channels.  The channel order is the same on big- and
 * little-endian builds, so a single definition covers both.
 */
struct util_format_r16g16b16_sint {
   int16_t r; /* first channel */
   int16_t g; /* second channel */
   int16_t b; /* third channel */
};

static inline void
util_format_r16g16b16_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r16g16b16_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 6;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_sint pixel;
         pixel.r = (int16_t)CLAMP(src[0], -32768, 32767);
         pixel.g = (int16_t)CLAMP(src[1], -32768, 32767);
         pixel.b = (int16_t)CLAMP(src[2], -32768, 32767);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_sint pixel;
         pixel.r = (int16_t)CLAMP(src[0], -32768, 32767);
         pixel.g = (int16_t)CLAMP(src[1], -32768, 32767);
         pixel.b = (int16_t)CLAMP(src[2], -32768, 32767);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16_SINT pixel from src into dst[0..3].
 *
 * The three 16-bit signed channels are read in native byte order and
 * widened to int; a is written as 1.  The coordinates i and j are not used.
 */
static inline void
util_format_r16g16b16_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r16g16b16_sint texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (int)texel.r; /* r */
   dst[1] = (int)texel.g; /* g */
   dst[2] = (int)texel.b; /* b */
   dst[3] = 1; /* a */
}

static inline void
util_format_r16g16b16_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MAX2(pixel.r, 0); /* r */
         dst[1] = (unsigned)MAX2(pixel.g, 0); /* g */
         dst[2] = (unsigned)MAX2(pixel.b, 0); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r16g16b16_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MAX2(pixel.r, 0); /* r */
         dst[1] = (unsigned)MAX2(pixel.g, 0); /* g */
         dst[2] = (unsigned)MAX2(pixel.b, 0); /* b */
         dst[3] = 1; /* a */
#endif
         src += 6;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16_sint pixel;
         pixel.r = (int16_t)MIN2(src[0], 32767);
         pixel.g = (int16_t)MIN2(src[1], 32767);
         pixel.b = (int16_t)MIN2(src[2], 32767);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16_sint pixel;
         pixel.r = (int16_t)MIN2(src[0], 32767);
         pixel.g = (int16_t)MIN2(src[1], 32767);
         pixel.b = (int16_t)MIN2(src[2], 32767);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 6;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory image of one R16G16B16A16_SINT pixel: four consecutive 16-bit
 * signed channels.  The channel order is the same on big- and
 * little-endian builds, so a single definition covers both.
 */
struct util_format_r16g16b16a16_sint {
   int16_t r; /* first channel */
   int16_t g; /* second channel */
   int16_t b; /* third channel */
   int16_t a; /* fourth channel */
};

static inline void
util_format_r16g16b16a16_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = (int)pixel.a; /* a */
#else
         struct util_format_r16g16b16a16_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = (int)pixel.a; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16a16_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_sint pixel;
         pixel.r = (int16_t)CLAMP(src[0], -32768, 32767);
         pixel.g = (int16_t)CLAMP(src[1], -32768, 32767);
         pixel.b = (int16_t)CLAMP(src[2], -32768, 32767);
         pixel.a = (int16_t)CLAMP(src[3], -32768, 32767);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_sint pixel;
         pixel.r = (int16_t)CLAMP(src[0], -32768, 32767);
         pixel.g = (int16_t)CLAMP(src[1], -32768, 32767);
         pixel.b = (int16_t)CLAMP(src[2], -32768, 32767);
         pixel.a = (int16_t)CLAMP(src[3], -32768, 32767);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single R16G16B16A16_SINT pixel from src into dst[0..3].
 *
 * The four 16-bit signed channels are read in native byte order and
 * widened to int.  The coordinates i and j are not used.
 */
static inline void
util_format_r16g16b16a16_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   struct util_format_r16g16b16a16_sint texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = (int)texel.r; /* r */
   dst[1] = (int)texel.g; /* g */
   dst[2] = (int)texel.b; /* b */
   dst[3] = (int)texel.a; /* a */
}

static inline void
util_format_r16g16b16a16_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MAX2(pixel.r, 0); /* r */
         dst[1] = (unsigned)MAX2(pixel.g, 0); /* g */
         dst[2] = (unsigned)MAX2(pixel.b, 0); /* b */
         dst[3] = (unsigned)MAX2(pixel.a, 0); /* a */
#else
         struct util_format_r16g16b16a16_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MAX2(pixel.r, 0); /* r */
         dst[1] = (unsigned)MAX2(pixel.g, 0); /* g */
         dst[2] = (unsigned)MAX2(pixel.b, 0); /* b */
         dst[3] = (unsigned)MAX2(pixel.a, 0); /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r16g16b16a16_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r16g16b16a16_sint pixel;
         pixel.r = (int16_t)MIN2(src[0], 32767);
         pixel.g = (int16_t)MIN2(src[1], 32767);
         pixel.b = (int16_t)MIN2(src[2], 32767);
         pixel.a = (int16_t)MIN2(src[3], 32767);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r16g16b16a16_sint pixel;
         pixel.r = (int16_t)MIN2(src[0], 32767);
         pixel.g = (int16_t)MIN2(src[1], 32767);
         pixel.b = (int16_t)MIN2(src[2], 32767);
         pixel.a = (int16_t)MIN2(src[3], 32767);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack rows of R32_UINT pixels into four unsigned ints per pixel.
 *
 * Each source pixel is one native-endian 32-bit word copied to r
 * unchanged; g and b are written as 0 and a as 1.  src_stride and
 * dst_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r32_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t r;
         /* src is a byte pointer and may not be 4-byte aligned; memcpy
          * performs the load without a misaligned or type-punned access. */
         memcpy(&value, src, sizeof value);
         r = value;
         dst[0] = r; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack rows of unsigned RGBA values into R32_UINT pixels.
 *
 * Only src[0] (r) of every four-component source pixel is used; it is
 * stored unchanged as one native-endian 32-bit word.  dst_stride and
 * src_stride are byte distances between consecutive rows.
 */
static inline void
util_format_r32_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value = 0;
         value |= src[0];
         /* dst is a byte pointer and may not be 4-byte aligned; memcpy
          * performs the store without a misaligned or type-punned access. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R32_UINT pixel as an RGBA tuple of unsigned ints.
 *
 * Reads 4 bytes at src into red; green and blue are written as 0 and alpha
 * as 1.  The coordinate arguments i and j are not used.
 *
 * The channel is loaded with memcpy() so src need not be 4-byte aligned.
 */
static inline void
util_format_r32_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
         uint32_t r;
         memcpy(&r, src, sizeof r);
         dst[0] = r; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
}

/**
 * Unpack R32_UINT pixels into RGBA tuples of signed ints.
 *
 * The 32-bit source channel is passed through MIN2(r, 2147483647) before
 * the cast to int (i.e. capped at 2147483647, assuming MIN2 yields the
 * smaller operand) and stored as red; green and blue are written as 0 and
 * alpha as 1.  Both strides are in bytes.
 *
 * The channel is loaded with memcpy() rather than by dereferencing a casted
 * byte pointer, so src_row and src_stride need not be 4-byte aligned.
 */
static inline void
util_format_r32_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t r;
         memcpy(&r, src, sizeof r);
         dst[0] = (int)MIN2(r, 2147483647); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      /* dst_stride is in bytes; convert to elements of *dst_row. */
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA tuples of signed ints into R32_UINT pixels.
 *
 * Only the red component (src[0]) is stored, after MAX2(src[0], 0) (i.e.
 * negative inputs become 0, assuming MAX2 yields the larger operand).
 * Both strides are in bytes.
 *
 * The pixel is stored with memcpy() rather than through a casted byte
 * pointer, so dst_row and dst_stride need not be 4-byte aligned.
 */
static inline void
util_format_r32_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t value = (uint32_t)MAX2(src[0], 0);
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes; convert to elements of *src_row. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory image of one R32G32_UINT pixel: two unsigned 32-bit channels,
 * red followed by green.  The member order is the same whether or not
 * UTIL_ARCH_BIG_ENDIAN is set.
 */
struct util_format_r32g32_uint {
   uint32_t r; /* 1st channel */
   uint32_t g; /* 2nd channel */
};

static inline void
util_format_r32g32_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_uint pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_uint pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R32G32_UINT pixel as an RGBA tuple of unsigned ints.
 *
 * Copies 8 bytes from src; red and green come from the pixel, blue is
 * written as 0 and alpha as 1.  The coordinate arguments i and j are not
 * used.
 */
static inline void
util_format_r32g32_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Same channel layout for either host byte order. */
   struct util_format_r32g32_uint texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = texel.r;
   dst[1] = texel.g;
   dst[2] = 0;
   dst[3] = 1;
}

static inline void
util_format_r32g32_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)MIN2(pixel.r, 2147483647); /* r */
         dst[1] = (int)MIN2(pixel.g, 2147483647); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)MIN2(pixel.r, 2147483647); /* r */
         dst[1] = (int)MIN2(pixel.g, 2147483647); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_uint pixel;
         pixel.r = (uint32_t)MAX2(src[0], 0);
         pixel.g = (uint32_t)MAX2(src[1], 0);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_uint pixel;
         pixel.r = (uint32_t)MAX2(src[0], 0);
         pixel.g = (uint32_t)MAX2(src[1], 0);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory image of one R32G32B32_UINT pixel: three unsigned 32-bit
 * channels in red, green, blue order.  The member order is the same whether
 * or not UTIL_ARCH_BIG_ENDIAN is set.
 */
struct util_format_r32g32b32_uint {
   uint32_t r; /* 1st channel */
   uint32_t g; /* 2nd channel */
   uint32_t b; /* 3rd channel */
};

static inline void
util_format_r32g32b32_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32b32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 12;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_uint pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_uint pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R32G32B32_UINT pixel as an RGBA tuple of unsigned ints.
 *
 * Copies 12 bytes from src; red, green and blue come from the pixel and
 * alpha is written as 1.  The coordinate arguments i and j are not used.
 */
static inline void
util_format_r32g32b32_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Same channel layout for either host byte order. */
   struct util_format_r32g32b32_uint texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = texel.r;
   dst[1] = texel.g;
   dst[2] = texel.b;
   dst[3] = 1;
}

static inline void
util_format_r32g32b32_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)MIN2(pixel.r, 2147483647); /* r */
         dst[1] = (int)MIN2(pixel.g, 2147483647); /* g */
         dst[2] = (int)MIN2(pixel.b, 2147483647); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32b32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)MIN2(pixel.r, 2147483647); /* r */
         dst[1] = (int)MIN2(pixel.g, 2147483647); /* g */
         dst[2] = (int)MIN2(pixel.b, 2147483647); /* b */
         dst[3] = 1; /* a */
#endif
         src += 12;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_uint pixel;
         pixel.r = (uint32_t)MAX2(src[0], 0);
         pixel.g = (uint32_t)MAX2(src[1], 0);
         pixel.b = (uint32_t)MAX2(src[2], 0);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_uint pixel;
         pixel.r = (uint32_t)MAX2(src[0], 0);
         pixel.g = (uint32_t)MAX2(src[1], 0);
         pixel.b = (uint32_t)MAX2(src[2], 0);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory image of one R32G32B32A32_UINT pixel: four unsigned 32-bit
 * channels in red, green, blue, alpha order.  The member order is the same
 * whether or not UTIL_ARCH_BIG_ENDIAN is set.
 */
struct util_format_r32g32b32a32_uint {
   uint32_t r; /* 1st channel */
   uint32_t g; /* 2nd channel */
   uint32_t b; /* 3rd channel */
   uint32_t a; /* 4th channel */
};

static inline void
util_format_r32g32b32a32_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = pixel.a; /* a */
#else
         struct util_format_r32g32b32a32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = pixel.a; /* a */
#endif
         src += 16;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32a32_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_uint pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_uint pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R32G32B32A32_UINT pixel as an RGBA tuple of unsigned ints.
 *
 * Copies 16 bytes from src and writes all four components to dst.  The
 * coordinate arguments i and j are not used.
 */
static inline void
util_format_r32g32b32a32_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Same channel layout for either host byte order. */
   struct util_format_r32g32b32a32_uint texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = texel.r;
   dst[1] = texel.g;
   dst[2] = texel.b;
   dst[3] = texel.a;
}

static inline void
util_format_r32g32b32a32_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)MIN2(pixel.r, 2147483647); /* r */
         dst[1] = (int)MIN2(pixel.g, 2147483647); /* g */
         dst[2] = (int)MIN2(pixel.b, 2147483647); /* b */
         dst[3] = (int)MIN2(pixel.a, 2147483647); /* a */
#else
         struct util_format_r32g32b32a32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)MIN2(pixel.r, 2147483647); /* r */
         dst[1] = (int)MIN2(pixel.g, 2147483647); /* g */
         dst[2] = (int)MIN2(pixel.b, 2147483647); /* b */
         dst[3] = (int)MIN2(pixel.a, 2147483647); /* a */
#endif
         src += 16;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32a32_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_uint pixel;
         pixel.r = (uint32_t)MAX2(src[0], 0);
         pixel.g = (uint32_t)MAX2(src[1], 0);
         pixel.b = (uint32_t)MAX2(src[2], 0);
         pixel.a = (uint32_t)MAX2(src[3], 0);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_uint pixel;
         pixel.r = (uint32_t)MAX2(src[0], 0);
         pixel.g = (uint32_t)MAX2(src[1], 0);
         pixel.b = (uint32_t)MAX2(src[2], 0);
         pixel.a = (uint32_t)MAX2(src[3], 0);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack R32_SINT pixels into RGBA tuples of signed ints.
 *
 * The single 32-bit source channel is converted to int32_t and stored as
 * red; green and blue are written as 0 and alpha as 1.  Both strides are in
 * bytes: a source pixel is 4 bytes and a destination pixel is four ints.
 *
 * The channel is loaded with memcpy() rather than by dereferencing a casted
 * byte pointer, so src_row and src_stride need not be 4-byte aligned.
 */
static inline void
util_format_r32_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         int32_t r;
         memcpy(&value, src, sizeof value);
         r = (int32_t)(value) ;
         dst[0] = r; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      /* dst_stride is in bytes; convert to elements of *dst_row. */
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA tuples of signed ints into R32_SINT pixels.
 *
 * Only the red component (src[0]) of each four-value source tuple is
 * stored; the other three are ignored.  Both strides are in bytes.
 *
 * The pixel is stored with memcpy() rather than through a casted byte
 * pointer, so dst_row and dst_stride need not be 4-byte aligned.
 */
static inline void
util_format_r32_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t value = (uint32_t)(src[0]) ;
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes; convert to elements of *src_row. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R32_SINT pixel as an RGBA tuple of signed ints.
 *
 * Reads 4 bytes at src, converts them to int32_t and stores the result as
 * red; green and blue are written as 0 and alpha as 1.  The coordinate
 * arguments i and j are not used.
 *
 * The channel is loaded with memcpy() so src need not be 4-byte aligned.
 */
static inline void
util_format_r32_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
         uint32_t value;
         int32_t r;
         memcpy(&value, src, sizeof value);
         r = (int32_t)(value) ;
         dst[0] = r; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
}

/**
 * Unpack R32_SINT pixels into RGBA tuples of unsigned ints.
 *
 * The 32-bit source channel is converted to int32_t, passed through
 * MAX2(r, 0) (negative values become 0, assuming MAX2 yields the larger
 * operand) and stored as red; green and blue are written as 0 and alpha as
 * 1.  Both strides are in bytes.
 *
 * The channel is loaded with memcpy() rather than by dereferencing a casted
 * byte pointer, so src_row and src_stride need not be 4-byte aligned.
 */
static inline void
util_format_r32_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         int32_t r;
         memcpy(&value, src, sizeof value);
         r = (int32_t)(value) ;
         dst[0] = (unsigned)MAX2(r, 0); /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      /* dst_stride is in bytes; convert to elements of *dst_row. */
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack RGBA tuples of unsigned ints into R32_SINT pixels.
 *
 * Only the red component (src[0]) is stored, after
 * MIN2(src[0], 2147483647) (capped at 2147483647, assuming MIN2 yields the
 * smaller operand).  Both strides are in bytes.
 *
 * The pixel is stored with memcpy() rather than through a casted byte
 * pointer, so dst_row and dst_stride need not be 4-byte aligned.
 */
static inline void
util_format_r32_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t value = (uint32_t)((int32_t)MIN2(src[0], 2147483647)) ;
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      /* src_stride is in bytes; convert to elements of *src_row. */
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory image of one R32G32_SINT pixel: two signed 32-bit channels,
 * red followed by green.  The member order is the same whether or not
 * UTIL_ARCH_BIG_ENDIAN is set.
 */
struct util_format_r32g32_sint {
   int32_t r; /* 1st channel */
   int32_t g; /* 2nd channel */
};

static inline void
util_format_r32g32_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_sint pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_sint pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R32G32_SINT pixel as an RGBA tuple of signed ints.
 *
 * Copies 8 bytes from src; red and green come from the pixel, blue is
 * written as 0 and alpha as 1.  The coordinate arguments i and j are not
 * used.
 */
static inline void
util_format_r32g32_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Same channel layout for either host byte order. */
   struct util_format_r32g32_sint texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = texel.r;
   dst[1] = texel.g;
   dst[2] = 0;
   dst[3] = 1;
}

static inline void
util_format_r32g32_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MAX2(pixel.r, 0); /* r */
         dst[1] = (unsigned)MAX2(pixel.g, 0); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MAX2(pixel.r, 0); /* r */
         dst[1] = (unsigned)MAX2(pixel.g, 0); /* g */
         dst[2] = 0; /* b */
         dst[3] = 1; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32_sint pixel;
         pixel.r = (int32_t)MIN2(src[0], 2147483647);
         pixel.g = (int32_t)MIN2(src[1], 2147483647);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32_sint pixel;
         pixel.r = (int32_t)MIN2(src[0], 2147483647);
         pixel.g = (int32_t)MIN2(src[1], 2147483647);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory image of one R32G32B32_SINT pixel: three signed 32-bit
 * channels in red, green, blue order.  The member order is the same whether
 * or not UTIL_ARCH_BIG_ENDIAN is set.
 */
struct util_format_r32g32b32_sint {
   int32_t r; /* 1st channel */
   int32_t g; /* 2nd channel */
   int32_t b; /* 3rd channel */
};

static inline void
util_format_r32g32b32_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32b32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 12;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_sint pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_sint pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R32G32B32_SINT pixel as an RGBA tuple of signed ints.
 *
 * Copies 12 bytes from src; red, green and blue come from the pixel and
 * alpha is written as 1.  The coordinate arguments i and j are not used.
 */
static inline void
util_format_r32g32b32_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Same channel layout for either host byte order. */
   struct util_format_r32g32b32_sint texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = texel.r;
   dst[1] = texel.g;
   dst[2] = texel.b;
   dst[3] = 1;
}

static inline void
util_format_r32g32b32_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MAX2(pixel.r, 0); /* r */
         dst[1] = (unsigned)MAX2(pixel.g, 0); /* g */
         dst[2] = (unsigned)MAX2(pixel.b, 0); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_r32g32b32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MAX2(pixel.r, 0); /* r */
         dst[1] = (unsigned)MAX2(pixel.g, 0); /* g */
         dst[2] = (unsigned)MAX2(pixel.b, 0); /* b */
         dst[3] = 1; /* a */
#endif
         src += 12;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32_sint pixel;
         pixel.r = (int32_t)MIN2(src[0], 2147483647);
         pixel.g = (int32_t)MIN2(src[1], 2147483647);
         pixel.b = (int32_t)MIN2(src[2], 2147483647);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32_sint pixel;
         pixel.r = (int32_t)MIN2(src[0], 2147483647);
         pixel.g = (int32_t)MIN2(src[1], 2147483647);
         pixel.b = (int32_t)MIN2(src[2], 2147483647);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 12;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * In-memory image of one R32G32B32A32_SINT pixel: four signed 32-bit
 * channels in red, green, blue, alpha order.  The member order is the same
 * whether or not UTIL_ARCH_BIG_ENDIAN is set.
 */
struct util_format_r32g32b32a32_sint {
   int32_t r; /* 1st channel */
   int32_t g; /* 2nd channel */
   int32_t b; /* 3rd channel */
   int32_t a; /* 4th channel */
};

static inline void
util_format_r32g32b32a32_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = pixel.a; /* a */
#else
         struct util_format_r32g32b32a32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.r; /* r */
         dst[1] = pixel.g; /* g */
         dst[2] = pixel.b; /* b */
         dst[3] = pixel.a; /* a */
#endif
         src += 16;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32a32_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_sint pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_sint pixel;
         pixel.r = src[0];
         pixel.g = src[1];
         pixel.b = src[2];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one R32G32B32A32_SINT pixel as an RGBA tuple of signed ints.
 *
 * Copies 16 bytes from src and writes all four components to dst.  The
 * coordinate arguments i and j are not used.
 */
static inline void
util_format_r32g32b32a32_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Same channel layout for either host byte order. */
   struct util_format_r32g32b32a32_sint texel;

   memcpy(&texel, src, sizeof texel);
   dst[0] = texel.r;
   dst[1] = texel.g;
   dst[2] = texel.b;
   dst[3] = texel.a;
}

static inline void
util_format_r32g32b32a32_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MAX2(pixel.r, 0); /* r */
         dst[1] = (unsigned)MAX2(pixel.g, 0); /* g */
         dst[2] = (unsigned)MAX2(pixel.b, 0); /* b */
         dst[3] = (unsigned)MAX2(pixel.a, 0); /* a */
#else
         struct util_format_r32g32b32a32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MAX2(pixel.r, 0); /* r */
         dst[1] = (unsigned)MAX2(pixel.g, 0); /* g */
         dst[2] = (unsigned)MAX2(pixel.b, 0); /* b */
         dst[3] = (unsigned)MAX2(pixel.a, 0); /* a */
#endif
         src += 16;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_r32g32b32a32_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_r32g32b32a32_sint pixel;
         pixel.r = (int32_t)MIN2(src[0], 2147483647);
         pixel.g = (int32_t)MIN2(src[1], 2147483647);
         pixel.b = (int32_t)MIN2(src[2], 2147483647);
         pixel.a = (int32_t)MIN2(src[3], 2147483647);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_r32g32b32a32_sint pixel;
         pixel.r = (int32_t)MIN2(src[0], 2147483647);
         pixel.g = (int32_t)MIN2(src[1], 2147483647);
         pixel.b = (int32_t)MIN2(src[2], 2147483647);
         pixel.a = (int32_t)MIN2(src[3], 2147483647);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 16;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack A8_UINT texels to unsigned RGBA.
 *
 * Each source byte becomes the alpha channel of one four-component
 * destination pixel; red, green and blue are written as 0.  src_stride
 * and dst_stride are byte counts; width and height are in pixels.
 */
static inline void
util_format_a8_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *texels = src_row;
      unsigned *rgba = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         rgba[0] = 0;                     /* r */
         rgba[1] = 0;                     /* g */
         rgba[2] = 0;                     /* b */
         rgba[3] = (unsigned)texels[col]; /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA pixels into A8_UINT.
 *
 * Only the fourth (alpha) component of each source pixel is used; it is
 * passed through MIN2(alpha, 255) and stored as a single byte.
 * src_stride and dst_stride are byte counts.
 */
static inline void
util_format_a8_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const unsigned *rgba = src_row;
      uint8_t *texels = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         texels[col] = (uint8_t)MIN2(rgba[3], 255);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one A8_UINT texel as unsigned RGBA.
 *
 * Reads a single byte at src and writes it to dst[3]; dst[0..2] are set
 * to 0.  The i and j arguments are not used.
 */
static inline void
util_format_a8_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const unsigned alpha = src[0];

   dst[0] = 0;     /* r */
   dst[1] = 0;     /* g */
   dst[2] = 0;     /* b */
   dst[3] = alpha; /* a */
}

/**
 * Unpack A8_UINT texels to signed RGBA.
 *
 * Each source byte (0..255) becomes the alpha channel of one
 * four-component int pixel; red, green and blue are written as 0.
 * src_stride and dst_stride are byte counts.
 */
static inline void
util_format_a8_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *texels = src_row;
      int *rgba = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         rgba[0] = 0;                /* r */
         rgba[1] = 0;                /* g */
         rgba[2] = 0;                /* b */
         rgba[3] = (int)texels[col]; /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA pixels into A8_UINT.
 *
 * Only the fourth (alpha) component of each source pixel is used; it is
 * passed through CLAMP(alpha, 0, 255) and stored as a single byte.
 * src_stride and dst_stride are byte counts.
 */
static inline void
util_format_a8_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const int *rgba = src_row;
      uint8_t *texels = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         texels[col] = (uint8_t)CLAMP(rgba[3], 0, 255);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack I8_UINT texels to unsigned RGBA.
 *
 * Each source byte is replicated into all four components of the
 * destination pixel.  src_stride and dst_stride are byte counts.
 */
static inline void
util_format_i8_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *texels = src_row;
      unsigned *rgba = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         const unsigned intensity = texels[col];

         rgba[0] = intensity; /* r */
         rgba[1] = intensity; /* g */
         rgba[2] = intensity; /* b */
         rgba[3] = intensity; /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA pixels into I8_UINT.
 *
 * Only the first (red) component of each source pixel is used; it is
 * passed through MIN2(red, 255) and stored as a single byte.
 * src_stride and dst_stride are byte counts.
 */
static inline void
util_format_i8_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const unsigned *rgba = src_row;
      uint8_t *texels = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         texels[col] = (uint8_t)MIN2(rgba[0], 255);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one I8_UINT texel as unsigned RGBA.
 *
 * Reads a single byte at src and writes it to all four entries of dst.
 * The i and j arguments are not used.
 */
static inline void
util_format_i8_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const unsigned intensity = src[0];

   dst[0] = intensity; /* r */
   dst[1] = intensity; /* g */
   dst[2] = intensity; /* b */
   dst[3] = intensity; /* a */
}

/**
 * Unpack I8_UINT texels to signed RGBA.
 *
 * Each source byte (0..255) is replicated into all four int components
 * of the destination pixel.  src_stride and dst_stride are byte counts.
 */
static inline void
util_format_i8_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *texels = src_row;
      int *rgba = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         const int intensity = texels[col];

         rgba[0] = intensity; /* r */
         rgba[1] = intensity; /* g */
         rgba[2] = intensity; /* b */
         rgba[3] = intensity; /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA pixels into I8_UINT.
 *
 * Only the first (red) component of each source pixel is used; it is
 * passed through CLAMP(red, 0, 255) and stored as a single byte.
 * src_stride and dst_stride are byte counts.
 */
static inline void
util_format_i8_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const int *rgba = src_row;
      uint8_t *texels = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         texels[col] = (uint8_t)CLAMP(rgba[0], 0, 255);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack L8_UINT texels to unsigned RGBA.
 *
 * Each source byte is replicated into red, green and blue; alpha is
 * written as the integer 1.  src_stride and dst_stride are byte counts.
 */
static inline void
util_format_l8_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *texels = src_row;
      unsigned *rgba = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         const unsigned lum = texels[col];

         rgba[0] = lum; /* r */
         rgba[1] = lum; /* g */
         rgba[2] = lum; /* b */
         rgba[3] = 1;   /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA pixels into L8_UINT.
 *
 * Only the first (red) component of each source pixel is used; it is
 * passed through MIN2(red, 255) and stored as a single byte.
 * src_stride and dst_stride are byte counts.
 */
static inline void
util_format_l8_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const unsigned *rgba = src_row;
      uint8_t *texels = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         texels[col] = (uint8_t)MIN2(rgba[0], 255);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one L8_UINT texel as unsigned RGBA.
 *
 * Reads a single byte at src, writes it to dst[0..2] and sets dst[3] to
 * the integer 1.  The i and j arguments are not used.
 */
static inline void
util_format_l8_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const unsigned lum = src[0];

   dst[0] = lum; /* r */
   dst[1] = lum; /* g */
   dst[2] = lum; /* b */
   dst[3] = 1;   /* a */
}

/**
 * Unpack L8_UINT texels to signed RGBA.
 *
 * Each source byte (0..255) is replicated into red, green and blue;
 * alpha is written as the integer 1.  src_stride and dst_stride are
 * byte counts.
 */
static inline void
util_format_l8_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *texels = src_row;
      int *rgba = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         const int lum = texels[col];

         rgba[0] = lum; /* r */
         rgba[1] = lum; /* g */
         rgba[2] = lum; /* b */
         rgba[3] = 1;   /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA pixels into L8_UINT.
 *
 * Only the first (red) component of each source pixel is used; it is
 * passed through CLAMP(red, 0, 255) and stored as a single byte.
 * src_stride and dst_stride are byte counts.
 */
static inline void
util_format_l8_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const int *rgba = src_row;
      uint8_t *texels = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         texels[col] = (uint8_t)CLAMP(rgba[0], 0, 255);
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack L8A8_UINT texels to unsigned RGBA.
 *
 * Each 2-byte texel is loaded as a native uint16_t.  On little-endian
 * builds the low 8 bits feed r/g/b and the high 8 bits feed a; on
 * big-endian builds the two are swapped.  src_stride and dst_stride are
 * byte counts; width and height are in pixels.
 */
static inline void
util_format_l8a8_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value;
         uint16_t rgb;
         uint16_t a;
         /* src is a byte pointer with no alignment guarantee, so copy the
          * texel out instead of dereferencing it as a uint16_t. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         rgb = value >> 8;
         a = (value) & 0xff;
#else
         rgb = (value) & 0xff;
         a = value >> 8;
#endif
         dst[0] = (unsigned)rgb; /* r */
         dst[1] = (unsigned)rgb; /* g */
         dst[2] = (unsigned)rgb; /* b */
         dst[3] = (unsigned)a; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA pixels into L8A8_UINT.
 *
 * The first (red) and fourth (alpha) source components are each passed
 * through MIN2(x, 255) and combined into one native uint16_t: on
 * little-endian builds red occupies the low 8 bits and alpha the high 8
 * bits; big-endian builds swap them.  src_stride and dst_stride are byte
 * counts.
 */
static inline void
util_format_l8a8_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint8_t)MIN2(src[0], 255)) << 8;
         value |= ((uint8_t)MIN2(src[3], 255)) & 0xff;
#else
         value |= ((uint8_t)MIN2(src[0], 255)) & 0xff;
         value |= (uint32_t)((uint8_t)MIN2(src[3], 255)) << 8;
#endif
         /* dst is a byte pointer with no alignment guarantee, so store the
          * texel with memcpy instead of through a uint16_t cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one L8A8_UINT texel as unsigned RGBA.
 *
 * Loads two bytes at src as a native uint16_t.  On little-endian builds
 * the low 8 bits go to dst[0..2] and the high 8 bits to dst[3];
 * big-endian builds swap them.  The i and j arguments are not used.
 */
static inline void
util_format_l8a8_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint16_t value;
   uint16_t rgb;
   uint16_t a;
   /* src is a byte pointer with no alignment guarantee, so copy the texel
    * out instead of dereferencing it as a uint16_t. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   rgb = value >> 8;
   a = (value) & 0xff;
#else
   rgb = (value) & 0xff;
   a = value >> 8;
#endif
   dst[0] = (unsigned)rgb; /* r */
   dst[1] = (unsigned)rgb; /* g */
   dst[2] = (unsigned)rgb; /* b */
   dst[3] = (unsigned)a; /* a */
}

/**
 * Unpack L8A8_UINT texels to signed RGBA.
 *
 * Each 2-byte texel is loaded as a native uint16_t.  On little-endian
 * builds the low 8 bits feed r/g/b and the high 8 bits feed a; on
 * big-endian builds the two are swapped.  Values are stored as int in
 * the range 0..255.  src_stride and dst_stride are byte counts.
 */
static inline void
util_format_l8a8_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value;
         uint16_t rgb;
         uint16_t a;
         /* src is a byte pointer with no alignment guarantee, so copy the
          * texel out instead of dereferencing it as a uint16_t. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         rgb = value >> 8;
         a = (value) & 0xff;
#else
         rgb = (value) & 0xff;
         a = value >> 8;
#endif
         dst[0] = (int)rgb; /* r */
         dst[1] = (int)rgb; /* g */
         dst[2] = (int)rgb; /* b */
         dst[3] = (int)a; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA pixels into L8A8_UINT.
 *
 * The first (red) and fourth (alpha) source components are each passed
 * through CLAMP(x, 0, 255) and combined into one native uint16_t: on
 * little-endian builds red occupies the low 8 bits and alpha the high 8
 * bits; big-endian builds swap them.  src_stride and dst_stride are byte
 * counts.
 */
static inline void
util_format_l8a8_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint32_t)((uint8_t)CLAMP(src[0], 0, 255)) << 8;
         value |= ((uint8_t)CLAMP(src[3], 0, 255)) & 0xff;
#else
         value |= ((uint8_t)CLAMP(src[0], 0, 255)) & 0xff;
         value |= (uint32_t)((uint8_t)CLAMP(src[3], 0, 255)) << 8;
#endif
         /* dst is a byte pointer with no alignment guarantee, so store the
          * texel with memcpy instead of through a uint16_t cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack A8_SINT texels to signed RGBA.
 *
 * Each source byte is reinterpreted as int8_t and stored as the alpha
 * channel of one four-component int pixel; red, green and blue are
 * written as 0.  src_stride and dst_stride are byte counts.
 */
static inline void
util_format_a8_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *texels = src_row;
      int *rgba = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         const int8_t alpha = (int8_t)texels[col];

         rgba[0] = 0;          /* r */
         rgba[1] = 0;          /* g */
         rgba[2] = 0;          /* b */
         rgba[3] = (int)alpha; /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA pixels into A8_SINT.
 *
 * Only the fourth (alpha) component of each source pixel is used; it is
 * passed through CLAMP(alpha, -128, 127), narrowed to int8_t and stored
 * as a single byte.  src_stride and dst_stride are byte counts.
 */
static inline void
util_format_a8_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const int *rgba = src_row;
      uint8_t *texels = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         texels[col] = (uint8_t)((int8_t)CLAMP(rgba[3], -128, 127));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one A8_SINT texel as signed RGBA.
 *
 * Reads a single byte at src, reinterprets it as int8_t and writes it to
 * dst[3]; dst[0..2] are set to 0.  The i and j arguments are not used.
 */
static inline void
util_format_a8_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const int8_t alpha = (int8_t)src[0];

   dst[0] = 0;          /* r */
   dst[1] = 0;          /* g */
   dst[2] = 0;          /* b */
   dst[3] = (int)alpha; /* a */
}

/**
 * Unpack A8_SINT texels to unsigned RGBA.
 *
 * Each source byte is reinterpreted as int8_t, passed through
 * MAX2(alpha, 0) and stored as the alpha channel; red, green and blue
 * are written as 0.  src_stride and dst_stride are byte counts.
 */
static inline void
util_format_a8_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *texels = src_row;
      unsigned *rgba = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         const int8_t alpha = (int8_t)texels[col];

         rgba[0] = 0;                        /* r */
         rgba[1] = 0;                        /* g */
         rgba[2] = 0;                        /* b */
         rgba[3] = (unsigned)MAX2(alpha, 0); /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA pixels into A8_SINT.
 *
 * Only the fourth (alpha) component of each source pixel is used; it is
 * passed through MIN2(alpha, 127), narrowed to int8_t and stored as a
 * single byte.  src_stride and dst_stride are byte counts.
 */
static inline void
util_format_a8_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const unsigned *rgba = src_row;
      uint8_t *texels = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         texels[col] = (uint8_t)((int8_t)MIN2(rgba[3], 127));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack I8_SINT texels to signed RGBA.
 *
 * Each source byte is reinterpreted as int8_t and replicated into all
 * four int components of the destination pixel.  src_stride and
 * dst_stride are byte counts.
 */
static inline void
util_format_i8_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *texels = src_row;
      int *rgba = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         const int intensity = (int8_t)texels[col];

         rgba[0] = intensity; /* r */
         rgba[1] = intensity; /* g */
         rgba[2] = intensity; /* b */
         rgba[3] = intensity; /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA pixels into I8_SINT.
 *
 * Only the first (red) component of each source pixel is used; it is
 * passed through CLAMP(red, -128, 127), narrowed to int8_t and stored as
 * a single byte.  src_stride and dst_stride are byte counts.
 */
static inline void
util_format_i8_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const int *rgba = src_row;
      uint8_t *texels = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         texels[col] = (uint8_t)((int8_t)CLAMP(rgba[0], -128, 127));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one I8_SINT texel as signed RGBA.
 *
 * Reads a single byte at src, reinterprets it as int8_t and writes it to
 * all four entries of dst.  The i and j arguments are not used.
 */
static inline void
util_format_i8_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const int intensity = (int8_t)src[0];

   dst[0] = intensity; /* r */
   dst[1] = intensity; /* g */
   dst[2] = intensity; /* b */
   dst[3] = intensity; /* a */
}

/**
 * Unpack I8_SINT texels to unsigned RGBA.
 *
 * Each source byte is reinterpreted as int8_t, passed through
 * MAX2(value, 0) and replicated into all four unsigned components of the
 * destination pixel.  src_stride and dst_stride are byte counts.
 */
static inline void
util_format_i8_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *texels = src_row;
      unsigned *rgba = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         const int8_t raw = (int8_t)texels[col];
         const unsigned intensity = (unsigned)MAX2(raw, 0);

         rgba[0] = intensity; /* r */
         rgba[1] = intensity; /* g */
         rgba[2] = intensity; /* b */
         rgba[3] = intensity; /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA pixels into I8_SINT.
 *
 * Only the first (red) component of each source pixel is used; it is
 * passed through MIN2(red, 127), narrowed to int8_t and stored as a
 * single byte.  src_stride and dst_stride are byte counts.
 */
static inline void
util_format_i8_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const unsigned *rgba = src_row;
      uint8_t *texels = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         texels[col] = (uint8_t)((int8_t)MIN2(rgba[0], 127));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack L8_SINT texels to signed RGBA.
 *
 * Each source byte is reinterpreted as int8_t and replicated into red,
 * green and blue; alpha is written as the integer 1.  src_stride and
 * dst_stride are byte counts.
 */
static inline void
util_format_l8_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *texels = src_row;
      int *rgba = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         const int lum = (int8_t)texels[col];

         rgba[0] = lum; /* r */
         rgba[1] = lum; /* g */
         rgba[2] = lum; /* b */
         rgba[3] = 1;   /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA pixels into L8_SINT.
 *
 * Only the first (red) component of each source pixel is used; it is
 * passed through CLAMP(red, -128, 127), narrowed to int8_t and stored as
 * a single byte.  src_stride and dst_stride are byte counts.
 */
static inline void
util_format_l8_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const int *rgba = src_row;
      uint8_t *texels = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         texels[col] = (uint8_t)((int8_t)CLAMP(rgba[0], -128, 127));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Fetch one L8_SINT texel as signed RGBA.
 *
 * Reads a single byte at src, reinterprets it as int8_t, writes it to
 * dst[0..2] and sets dst[3] to the integer 1.  The i and j arguments are
 * not used.
 */
static inline void
util_format_l8_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   const int lum = (int8_t)src[0];

   dst[0] = lum; /* r */
   dst[1] = lum; /* g */
   dst[2] = lum; /* b */
   dst[3] = 1;   /* a */
}

/**
 * Unpack L8_SINT texels to unsigned RGBA.
 *
 * Each source byte is reinterpreted as int8_t, passed through
 * MAX2(value, 0) and replicated into red, green and blue; alpha is
 * written as the integer 1.  src_stride and dst_stride are byte counts.
 */
static inline void
util_format_l8_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const uint8_t *texels = src_row;
      unsigned *rgba = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         const int8_t raw = (int8_t)texels[col];
         const unsigned lum = (unsigned)MAX2(raw, 0);

         rgba[0] = lum; /* r */
         rgba[1] = lum; /* g */
         rgba[2] = lum; /* b */
         rgba[3] = 1;   /* a */
      }

      src_row += src_stride;
      dst_row += dst_stride / sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA pixels into L8_SINT.
 *
 * Only the first (red) component of each source pixel is used; it is
 * passed through MIN2(red, 127), narrowed to int8_t and stored as a
 * single byte.  src_stride and dst_stride are byte counts.
 */
static inline void
util_format_l8_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned row, col;

   for (row = 0; row < height; ++row) {
      const unsigned *rgba = src_row;
      uint8_t *texels = dst_row;

      for (col = 0; col < width; ++col, rgba += 4) {
         texels[col] = (uint8_t)((int8_t)MIN2(rgba[0], 127));
      }

      dst_row += dst_stride;
      src_row += src_stride / sizeof(*src_row);
   }
}

/**
 * Unpack L8A8_SINT texels to signed RGBA.
 *
 * Each 2-byte texel is loaded as a native uint16_t and split into two
 * sign-extended 8-bit fields.  On little-endian builds the low 8 bits
 * feed r/g/b and the high 8 bits feed a; on big-endian builds the two
 * are swapped.  src_stride and dst_stride are byte counts.
 */
static inline void
util_format_l8a8_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value;
         int16_t rgb;
         int16_t a;
         /* src is a byte pointer with no alignment guarantee, so copy the
          * texel out instead of dereferencing it as a uint16_t. */
         memcpy(&value, src, sizeof value);
         /* Shifting the field to the top of an int16_t and back down
          * sign-extends it. */
#if UTIL_ARCH_BIG_ENDIAN
         rgb = ((int16_t)(value) ) >> 8;
         a = ((int16_t)(value << 8) ) >> 8;
#else
         rgb = ((int16_t)(value << 8) ) >> 8;
         a = ((int16_t)(value) ) >> 8;
#endif
         dst[0] = (int)rgb; /* r */
         dst[1] = (int)rgb; /* g */
         dst[2] = (int)rgb; /* b */
         dst[3] = (int)a; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA pixels into L8A8_SINT.
 *
 * The first (red) and fourth (alpha) source components are each passed
 * through CLAMP(x, -128, 127), narrowed to int8_t and combined into one
 * native uint16_t: on little-endian builds red occupies the low 8 bits
 * and alpha the high 8 bits; big-endian builds swap them.  src_stride
 * and dst_stride are byte counts.
 */
static inline void
util_format_l8a8_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint16_t)((uint32_t)((int8_t)CLAMP(src[0], -128, 127)) << 8) ;
         value |= (uint16_t)(((int8_t)CLAMP(src[3], -128, 127)) & 0xff) ;
#else
         value |= (uint16_t)(((int8_t)CLAMP(src[0], -128, 127)) & 0xff) ;
         value |= (uint16_t)((uint32_t)((int8_t)CLAMP(src[3], -128, 127)) << 8) ;
#endif
         /* dst is a byte pointer with no alignment guarantee, so store the
          * texel with memcpy instead of through a uint16_t cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one L8A8_SINT texel as signed RGBA.
 *
 * Loads two bytes at src as a native uint16_t and splits them into two
 * sign-extended 8-bit fields.  On little-endian builds the low 8 bits go
 * to dst[0..2] and the high 8 bits to dst[3]; big-endian builds swap
 * them.  The i and j arguments are not used.
 */
static inline void
util_format_l8a8_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint16_t value;
   int16_t rgb;
   int16_t a;
   /* src is a byte pointer with no alignment guarantee, so copy the texel
    * out instead of dereferencing it as a uint16_t. */
   memcpy(&value, src, sizeof value);
   /* Shifting the field to the top of an int16_t and back down
    * sign-extends it. */
#if UTIL_ARCH_BIG_ENDIAN
   rgb = ((int16_t)(value) ) >> 8;
   a = ((int16_t)(value << 8) ) >> 8;
#else
   rgb = ((int16_t)(value << 8) ) >> 8;
   a = ((int16_t)(value) ) >> 8;
#endif
   dst[0] = (int)rgb; /* r */
   dst[1] = (int)rgb; /* g */
   dst[2] = (int)rgb; /* b */
   dst[3] = (int)a; /* a */
}

/**
 * Unpack L8A8_SINT texels to unsigned RGBA.
 *
 * Each 2-byte texel is loaded as a native uint16_t and split into two
 * sign-extended 8-bit fields, each passed through MAX2(field, 0) before
 * being stored as unsigned.  On little-endian builds the low 8 bits feed
 * r/g/b and the high 8 bits feed a; on big-endian builds the two are
 * swapped.  src_stride and dst_stride are byte counts.
 */
static inline void
util_format_l8a8_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value;
         int16_t rgb;
         int16_t a;
         /* src is a byte pointer with no alignment guarantee, so copy the
          * texel out instead of dereferencing it as a uint16_t. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         rgb = ((int16_t)(value) ) >> 8;
         a = ((int16_t)(value << 8) ) >> 8;
#else
         rgb = ((int16_t)(value << 8) ) >> 8;
         a = ((int16_t)(value) ) >> 8;
#endif
         dst[0] = (unsigned)MAX2(rgb, 0); /* r */
         dst[1] = (unsigned)MAX2(rgb, 0); /* g */
         dst[2] = (unsigned)MAX2(rgb, 0); /* b */
         dst[3] = (unsigned)MAX2(a, 0); /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA pixels into L8A8_SINT.
 *
 * The first (red) and fourth (alpha) source components are each passed
 * through MIN2(x, 127), narrowed to int8_t and combined into one native
 * uint16_t: on little-endian builds red occupies the low 8 bits and
 * alpha the high 8 bits; big-endian builds swap them.  src_stride and
 * dst_stride are byte counts.
 */
static inline void
util_format_l8a8_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value = 0;
#if UTIL_ARCH_BIG_ENDIAN
         value |= (uint16_t)((uint32_t)((int8_t)MIN2(src[0], 127)) << 8) ;
         value |= (uint16_t)(((int8_t)MIN2(src[3], 127)) & 0xff) ;
#else
         value |= (uint16_t)(((int8_t)MIN2(src[0], 127)) & 0xff) ;
         value |= (uint16_t)((uint32_t)((int8_t)MIN2(src[3], 127)) << 8) ;
#endif
         /* dst is a byte pointer with no alignment guarantee, so store the
          * texel with memcpy instead of through a uint16_t cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack A16_UINT texels to unsigned RGBA.
 *
 * Each 2-byte texel is loaded as a native uint16_t and stored as the
 * alpha channel; red, green and blue are written as 0.  src_stride and
 * dst_stride are byte counts; width and height are in pixels.
 */
static inline void
util_format_a16_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t a;
         /* src is a byte pointer with no alignment guarantee, so copy the
          * texel out instead of dereferencing it as a uint16_t. */
         memcpy(&a, src, sizeof a);
         dst[0] = 0; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = (unsigned)a; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA pixels into A16_UINT.
 *
 * Only the fourth (alpha) component of each source pixel is used; it is
 * passed through MIN2(alpha, 65535) and stored as a native uint16_t.
 * src_stride and dst_stride are byte counts.
 */
static inline void
util_format_a16_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value = 0;
         value |= (uint16_t)MIN2(src[3], 65535);
         /* dst is a byte pointer with no alignment guarantee, so store the
          * texel with memcpy instead of through a uint16_t cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one A16_UINT texel as unsigned RGBA.
 *
 * Loads two bytes at src as a native uint16_t and writes the value to
 * dst[3]; dst[0..2] are set to 0.  The i and j arguments are not used.
 */
static inline void
util_format_a16_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint16_t a;
   /* src is a byte pointer with no alignment guarantee, so copy the texel
    * out instead of dereferencing it as a uint16_t. */
   memcpy(&a, src, sizeof a);
   dst[0] = 0; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = (unsigned)a; /* a */
}

/**
 * Unpack A16_UINT texels to signed RGBA.
 *
 * Each 2-byte texel is loaded as a native uint16_t and stored as the int
 * alpha channel (0..65535); red, green and blue are written as 0.
 * src_stride and dst_stride are byte counts.
 */
static inline void
util_format_a16_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t a;
         /* src is a byte pointer with no alignment guarantee, so copy the
          * texel out instead of dereferencing it as a uint16_t. */
         memcpy(&a, src, sizeof a);
         dst[0] = 0; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = (int)a; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA pixels into A16_UINT.
 *
 * Only the fourth (alpha) component of each source pixel is used; it is
 * passed through CLAMP(alpha, 0, 65535) and stored as a native uint16_t.
 * src_stride and dst_stride are byte counts.
 */
static inline void
util_format_a16_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value = 0;
         value |= (uint16_t)CLAMP(src[3], 0, 65535);
         /* dst is a byte pointer with no alignment guarantee, so store the
          * texel with memcpy instead of through a uint16_t cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack I16_UINT texels to unsigned RGBA.
 *
 * Each 2-byte texel is loaded as a native uint16_t and replicated into
 * all four components of the destination pixel.  src_stride and
 * dst_stride are byte counts; width and height are in pixels.
 */
static inline void
util_format_i16_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t rgba;
         /* src is a byte pointer with no alignment guarantee, so copy the
          * texel out instead of dereferencing it as a uint16_t. */
         memcpy(&rgba, src, sizeof rgba);
         dst[0] = (unsigned)rgba; /* r */
         dst[1] = (unsigned)rgba; /* g */
         dst[2] = (unsigned)rgba; /* b */
         dst[3] = (unsigned)rgba; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA pixels into I16_UINT.
 *
 * Only the first (red) component of each source pixel is used; it is
 * passed through MIN2(red, 65535) and stored as a native uint16_t.
 * src_stride and dst_stride are byte counts.
 */
static inline void
util_format_i16_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         uint16_t value = 0;
         value |= (uint16_t)MIN2(src[0], 65535);
         /* dst is a byte pointer with no alignment guarantee, so store the
          * texel with memcpy instead of through a uint16_t cast. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single I16_UINT pixel as unsigned RGBA.
 *
 * The native-endian 16-bit value at src is copied to all four channels of
 * dst.  The i and j arguments are not used.
 */
static inline void
util_format_i16_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint16_t rgba;
   /* Byte-wise load: src need not be 2-byte aligned. */
   memcpy(&rgba, src, sizeof rgba);
   dst[0] = (unsigned)rgba; /* r */
   dst[1] = (unsigned)rgba; /* g */
   dst[2] = (unsigned)rgba; /* b */
   dst[3] = (unsigned)rgba; /* a */
}

/**
 * Unpack I16_UINT pixels into signed RGBA.
 *
 * Each pixel is one native-endian 16-bit unsigned value that is copied to
 * all four destination channels (always representable as a non-negative
 * int).
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_i16_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t rgba;
         /* Byte-wise load: src need not be 2-byte aligned. */
         memcpy(&rgba, src, sizeof rgba);
         dst[0] = (int)rgba; /* r */
         dst[1] = (int)rgba; /* g */
         dst[2] = (int)rgba; /* b */
         dst[3] = (int)rgba; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA integers into I16_UINT pixels.
 *
 * The first component (src[0]) is saturated to [0, 65535] and written as
 * a native-endian 16-bit value; the other components are ignored.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_i16_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const int v = src[0];
         const uint16_t value = (uint16_t)(v < 0 ? 0 : (v > 65535 ? 65535 : v));
         /* Byte-wise store: dst need not be 2-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack L16_UINT pixels into unsigned RGBA.
 *
 * Each pixel is one native-endian 16-bit unsigned value that is copied to
 * r, g and b; alpha is set to 1.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l16_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t rgb;
         /* Byte-wise load: src need not be 2-byte aligned. */
         memcpy(&rgb, src, sizeof rgb);
         dst[0] = (unsigned)rgb; /* r */
         dst[1] = (unsigned)rgb; /* g */
         dst[2] = (unsigned)rgb; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA integers into L16_UINT pixels.
 *
 * The first component (src[0]) is saturated to 65535 and written as a
 * native-endian 16-bit value; the other components are ignored.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l16_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint16_t value = (uint16_t)(src[0] > 65535u ? 65535u : src[0]);
         /* Byte-wise store: dst need not be 2-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single L16_UINT pixel as unsigned RGBA.
 *
 * The native-endian 16-bit value at src is copied to r, g and b; alpha is
 * set to 1.  The i and j arguments are not used.
 */
static inline void
util_format_l16_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint16_t rgb;
   /* Byte-wise load: src need not be 2-byte aligned. */
   memcpy(&rgb, src, sizeof rgb);
   dst[0] = (unsigned)rgb; /* r */
   dst[1] = (unsigned)rgb; /* g */
   dst[2] = (unsigned)rgb; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack L16_UINT pixels into signed RGBA.
 *
 * Each pixel is one native-endian 16-bit unsigned value that is copied to
 * r, g and b; alpha is set to 1.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l16_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint16_t rgb;
         /* Byte-wise load: src need not be 2-byte aligned. */
         memcpy(&rgb, src, sizeof rgb);
         dst[0] = (int)rgb; /* r */
         dst[1] = (int)rgb; /* g */
         dst[2] = (int)rgb; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA integers into L16_UINT pixels.
 *
 * The first component (src[0]) is saturated to [0, 65535] and written as
 * a native-endian 16-bit value; the other components are ignored.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l16_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const int v = src[0];
         const uint16_t value = (uint16_t)(v < 0 ? 0 : (v > 65535 ? 65535 : v));
         /* Byte-wise store: dst need not be 2-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack L16A16_UINT pixels into unsigned RGBA.
 *
 * Each pixel is a native-endian 32-bit word holding two 16-bit unsigned
 * fields.  The first is copied to r, g and b, the second to alpha; which
 * half of the word holds which field depends on UTIL_ARCH_BIG_ENDIAN.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l16a16_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t rgb;
         uint32_t a;
         /* Byte-wise load: src need not be 4-byte aligned. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         rgb = value >> 16;
         a = value & 0xffff;
#else
         rgb = value & 0xffff;
         a = value >> 16;
#endif
         dst[0] = (unsigned)rgb; /* r */
         dst[1] = (unsigned)rgb; /* g */
         dst[2] = (unsigned)rgb; /* b */
         dst[3] = (unsigned)a; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA integers into L16A16_UINT pixels.
 *
 * src[0] and src[3] are each saturated to 65535 and combined into one
 * native-endian 32-bit word; which half of the word holds which field
 * depends on UTIL_ARCH_BIG_ENDIAN.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l16a16_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t rgb = src[0] > 65535u ? 65535u : src[0];
         const uint32_t a = src[3] > 65535u ? 65535u : src[3];
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (rgb << 16) | a;
#else
         value = rgb | (a << 16);
#endif
         /* Byte-wise store: dst need not be 4-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single L16A16_UINT pixel as unsigned RGBA.
 *
 * The native-endian 32-bit word at src holds two 16-bit unsigned fields:
 * the first is copied to r, g and b, the second to alpha.  Which half of
 * the word holds which field depends on UTIL_ARCH_BIG_ENDIAN.  The i and
 * j arguments are not used.
 */
static inline void
util_format_l16a16_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   uint32_t rgb;
   uint32_t a;
   /* Byte-wise load: src need not be 4-byte aligned. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   rgb = value >> 16;
   a = value & 0xffff;
#else
   rgb = value & 0xffff;
   a = value >> 16;
#endif
   dst[0] = (unsigned)rgb; /* r */
   dst[1] = (unsigned)rgb; /* g */
   dst[2] = (unsigned)rgb; /* b */
   dst[3] = (unsigned)a; /* a */
}

/**
 * Unpack L16A16_UINT pixels into signed RGBA.
 *
 * Each pixel is a native-endian 32-bit word holding two 16-bit unsigned
 * fields.  The first is copied to r, g and b, the second to alpha; which
 * half of the word holds which field depends on UTIL_ARCH_BIG_ENDIAN.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l16a16_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t rgb;
         uint32_t a;
         /* Byte-wise load: src need not be 4-byte aligned. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         rgb = value >> 16;
         a = value & 0xffff;
#else
         rgb = value & 0xffff;
         a = value >> 16;
#endif
         dst[0] = (int)rgb; /* r */
         dst[1] = (int)rgb; /* g */
         dst[2] = (int)rgb; /* b */
         dst[3] = (int)a; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA integers into L16A16_UINT pixels.
 *
 * src[0] and src[3] are each saturated to [0, 65535] and combined into
 * one native-endian 32-bit word; which half of the word holds which field
 * depends on UTIL_ARCH_BIG_ENDIAN.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l16a16_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const int l = src[0];
         const int al = src[3];
         const uint32_t rgb = (uint32_t)(l < 0 ? 0 : (l > 65535 ? 65535 : l));
         const uint32_t a = (uint32_t)(al < 0 ? 0 : (al > 65535 ? 65535 : al));
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (rgb << 16) | a;
#else
         value = rgb | (a << 16);
#endif
         /* Byte-wise store: dst need not be 4-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack A16_SINT pixels into signed RGBA.
 *
 * Each pixel is one native-endian 16-bit signed value that becomes alpha;
 * r, g and b are set to 0.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_a16_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         int16_t a;
         /* Byte-wise load: src need not be 2-byte aligned. */
         memcpy(&a, src, sizeof a);
         dst[0] = 0; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = (int)a; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA integers into A16_SINT pixels.
 *
 * Only the alpha component (src[3]) is stored; it is saturated to
 * [-32768, 32767] and written as a native-endian 16-bit value.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_a16_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const int a = src[3];
         const int16_t value = (int16_t)(a < -32768 ? -32768 : (a > 32767 ? 32767 : a));
         /* Byte-wise store: dst need not be 2-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single A16_SINT pixel as signed RGBA.
 *
 * The native-endian 16-bit signed value at src becomes alpha; r, g and b
 * are set to 0.  The i and j arguments are not used.
 */
static inline void
util_format_a16_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   int16_t a;
   /* Byte-wise load: src need not be 2-byte aligned. */
   memcpy(&a, src, sizeof a);
   dst[0] = 0; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = (int)a; /* a */
}

/**
 * Unpack A16_SINT pixels into unsigned RGBA.
 *
 * Each pixel is one native-endian 16-bit signed value that becomes alpha,
 * with negative values raised to 0; r, g and b are set to 0.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_a16_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         int16_t a;
         /* Byte-wise load: src need not be 2-byte aligned. */
         memcpy(&a, src, sizeof a);
         dst[0] = 0; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = a > 0 ? (unsigned)a : 0u; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA integers into A16_SINT pixels.
 *
 * Only the alpha component (src[3]) is stored; it is saturated to 32767
 * and written as a native-endian 16-bit value.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_a16_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const int16_t value = (int16_t)(src[3] > 32767u ? 32767u : src[3]);
         /* Byte-wise store: dst need not be 2-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack I16_SINT pixels into signed RGBA.
 *
 * Each pixel is one native-endian 16-bit signed value that is copied to
 * all four destination channels.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_i16_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         int16_t rgba;
         /* Byte-wise load: src need not be 2-byte aligned. */
         memcpy(&rgba, src, sizeof rgba);
         dst[0] = (int)rgba; /* r */
         dst[1] = (int)rgba; /* g */
         dst[2] = (int)rgba; /* b */
         dst[3] = (int)rgba; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA integers into I16_SINT pixels.
 *
 * The first component (src[0]) is saturated to [-32768, 32767] and
 * written as a native-endian 16-bit value; the other components are
 * ignored.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_i16_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const int v = src[0];
         const int16_t value = (int16_t)(v < -32768 ? -32768 : (v > 32767 ? 32767 : v));
         /* Byte-wise store: dst need not be 2-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single I16_SINT pixel as signed RGBA.
 *
 * The native-endian 16-bit signed value at src is copied to all four
 * channels of dst.  The i and j arguments are not used.
 */
static inline void
util_format_i16_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   int16_t rgba;
   /* Byte-wise load: src need not be 2-byte aligned. */
   memcpy(&rgba, src, sizeof rgba);
   dst[0] = (int)rgba; /* r */
   dst[1] = (int)rgba; /* g */
   dst[2] = (int)rgba; /* b */
   dst[3] = (int)rgba; /* a */
}

/**
 * Unpack I16_SINT pixels into unsigned RGBA.
 *
 * Each pixel is one native-endian 16-bit signed value; negative values
 * are raised to 0 and the result is copied to all four channels.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_i16_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         int16_t rgba;
         unsigned clamped;
         /* Byte-wise load: src need not be 2-byte aligned. */
         memcpy(&rgba, src, sizeof rgba);
         clamped = rgba > 0 ? (unsigned)rgba : 0u;
         dst[0] = clamped; /* r */
         dst[1] = clamped; /* g */
         dst[2] = clamped; /* b */
         dst[3] = clamped; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA integers into I16_SINT pixels.
 *
 * The first component (src[0]) is saturated to 32767 and written as a
 * native-endian 16-bit value; the other components are ignored.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_i16_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const int16_t value = (int16_t)(src[0] > 32767u ? 32767u : src[0]);
         /* Byte-wise store: dst need not be 2-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack L16_SINT pixels into signed RGBA.
 *
 * Each pixel is one native-endian 16-bit signed value that is copied to
 * r, g and b; alpha is set to 1.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l16_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         int16_t rgb;
         /* Byte-wise load: src need not be 2-byte aligned. */
         memcpy(&rgb, src, sizeof rgb);
         dst[0] = (int)rgb; /* r */
         dst[1] = (int)rgb; /* g */
         dst[2] = (int)rgb; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA integers into L16_SINT pixels.
 *
 * The first component (src[0]) is saturated to [-32768, 32767] and
 * written as a native-endian 16-bit value; the other components are
 * ignored.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l16_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const int v = src[0];
         const int16_t value = (int16_t)(v < -32768 ? -32768 : (v > 32767 ? 32767 : v));
         /* Byte-wise store: dst need not be 2-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single L16_SINT pixel as signed RGBA.
 *
 * The native-endian 16-bit signed value at src is copied to r, g and b;
 * alpha is set to 1.  The i and j arguments are not used.
 */
static inline void
util_format_l16_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   int16_t rgb;
   /* Byte-wise load: src need not be 2-byte aligned. */
   memcpy(&rgb, src, sizeof rgb);
   dst[0] = (int)rgb; /* r */
   dst[1] = (int)rgb; /* g */
   dst[2] = (int)rgb; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack L16_SINT pixels into unsigned RGBA.
 *
 * Each pixel is one native-endian 16-bit signed value; negative values
 * are raised to 0 and the result is copied to r, g and b.  Alpha is set
 * to 1.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l16_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         int16_t rgb;
         unsigned clamped;
         /* Byte-wise load: src need not be 2-byte aligned. */
         memcpy(&rgb, src, sizeof rgb);
         clamped = rgb > 0 ? (unsigned)rgb : 0u;
         dst[0] = clamped; /* r */
         dst[1] = clamped; /* g */
         dst[2] = clamped; /* b */
         dst[3] = 1; /* a */
         src += 2;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA integers into L16_SINT pixels.
 *
 * The first component (src[0]) is saturated to 32767 and written as a
 * native-endian 16-bit value; the other components are ignored.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l16_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const int16_t value = (int16_t)(src[0] > 32767u ? 32767u : src[0]);
         /* Byte-wise store: dst need not be 2-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 2;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack L16A16_SINT pixels into signed RGBA.
 *
 * Each pixel is a native-endian 32-bit word holding two 16-bit signed
 * fields.  The first is copied to r, g and b, the second to alpha; which
 * half of the word holds which field depends on UTIL_ARCH_BIG_ENDIAN.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l16a16_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         int32_t rgb;
         int32_t a;
         /* Byte-wise load: src need not be 4-byte aligned. */
         memcpy(&value, src, sizeof value);
         /* Shift each field to the top of the word, then shift right as a
          * signed value to sign-extend it. */
#if UTIL_ARCH_BIG_ENDIAN
         rgb = ((int32_t)(value) ) >> 16;
         a = ((int32_t)(value << 16) ) >> 16;
#else
         rgb = ((int32_t)(value << 16) ) >> 16;
         a = ((int32_t)(value) ) >> 16;
#endif
         dst[0] = (int)rgb; /* r */
         dst[1] = (int)rgb; /* g */
         dst[2] = (int)rgb; /* b */
         dst[3] = (int)a; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA integers into L16A16_SINT pixels.
 *
 * src[0] and src[3] are each saturated to [-32768, 32767], reduced to
 * their low 16 bits and combined into one native-endian 32-bit word;
 * which half of the word holds which field depends on
 * UTIL_ARCH_BIG_ENDIAN.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l16a16_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const int l = src[0];
         const int al = src[3];
         const int lc = l < -32768 ? -32768 : (l > 32767 ? 32767 : l);
         const int ac = al < -32768 ? -32768 : (al > 32767 ? 32767 : al);
         /* Converting to uint32_t before masking keeps the bit pattern of
          * negative values without shifting a signed quantity. */
         const uint32_t rgb = (uint32_t)lc & 0xffff;
         const uint32_t a = (uint32_t)ac & 0xffff;
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (rgb << 16) | a;
#else
         value = rgb | (a << 16);
#endif
         /* Byte-wise store: dst need not be 4-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single L16A16_SINT pixel as signed RGBA.
 *
 * The native-endian 32-bit word at src holds two 16-bit signed fields:
 * the first is copied to r, g and b, the second to alpha.  Which half of
 * the word holds which field depends on UTIL_ARCH_BIG_ENDIAN.  The i and
 * j arguments are not used.
 */
static inline void
util_format_l16a16_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   int32_t rgb;
   int32_t a;
   /* Byte-wise load: src need not be 4-byte aligned. */
   memcpy(&value, src, sizeof value);
   /* Shift each field to the top of the word, then shift right as a
    * signed value to sign-extend it. */
#if UTIL_ARCH_BIG_ENDIAN
   rgb = ((int32_t)(value) ) >> 16;
   a = ((int32_t)(value << 16) ) >> 16;
#else
   rgb = ((int32_t)(value << 16) ) >> 16;
   a = ((int32_t)(value) ) >> 16;
#endif
   dst[0] = (int)rgb; /* r */
   dst[1] = (int)rgb; /* g */
   dst[2] = (int)rgb; /* b */
   dst[3] = (int)a; /* a */
}

/**
 * Unpack L16A16_SINT pixels into unsigned RGBA.
 *
 * Each pixel is a native-endian 32-bit word holding two 16-bit signed
 * fields.  Negative values are raised to 0; the first field is copied to
 * r, g and b, the second to alpha.  Which half of the word holds which
 * field depends on UTIL_ARCH_BIG_ENDIAN.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l16a16_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t value;
         int32_t rgb;
         int32_t a;
         unsigned rgb_clamped;
         /* Byte-wise load: src need not be 4-byte aligned. */
         memcpy(&value, src, sizeof value);
         /* Shift each field to the top of the word, then shift right as a
          * signed value to sign-extend it. */
#if UTIL_ARCH_BIG_ENDIAN
         rgb = ((int32_t)(value) ) >> 16;
         a = ((int32_t)(value << 16) ) >> 16;
#else
         rgb = ((int32_t)(value << 16) ) >> 16;
         a = ((int32_t)(value) ) >> 16;
#endif
         rgb_clamped = rgb > 0 ? (unsigned)rgb : 0u;
         dst[0] = rgb_clamped; /* r */
         dst[1] = rgb_clamped; /* g */
         dst[2] = rgb_clamped; /* b */
         dst[3] = a > 0 ? (unsigned)a : 0u; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA integers into L16A16_SINT pixels.
 *
 * src[0] and src[3] are each saturated to 32767 and combined into one
 * native-endian 32-bit word; which half of the word holds which field
 * depends on UTIL_ARCH_BIG_ENDIAN.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l16a16_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t rgb = src[0] > 32767u ? 32767u : src[0];
         const uint32_t a = src[3] > 32767u ? 32767u : src[3];
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (rgb << 16) | a;
#else
         value = rgb | (a << 16);
#endif
         /* Byte-wise store: dst need not be 4-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack A32_UINT pixels into unsigned RGBA.
 *
 * Each pixel is one native-endian 32-bit unsigned value that becomes
 * alpha; r, g and b are set to 0.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_a32_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t a;
         /* Byte-wise load: src need not be 4-byte aligned. */
         memcpy(&a, src, sizeof a);
         dst[0] = 0; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = a; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA integers into A32_UINT pixels.
 *
 * Only the alpha component (src[3]) is stored, as a native-endian 32-bit
 * value.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_a32_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t value = src[3];
         /* Byte-wise store: dst need not be 4-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single A32_UINT pixel as unsigned RGBA.
 *
 * The native-endian 32-bit value at src becomes alpha; r, g and b are set
 * to 0.  The i and j arguments are not used.
 */
static inline void
util_format_a32_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t a;
   /* Byte-wise load: src need not be 4-byte aligned. */
   memcpy(&a, src, sizeof a);
   dst[0] = 0; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = a; /* a */
}

/**
 * Unpack A32_UINT pixels into signed RGBA.
 *
 * Each pixel is one native-endian 32-bit unsigned value that becomes
 * alpha, saturated to 2147483647; r, g and b are set to 0.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_a32_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t a;
         /* Byte-wise load: src need not be 4-byte aligned. */
         memcpy(&a, src, sizeof a);
         dst[0] = 0; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = (int)(a > 2147483647u ? 2147483647u : a); /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA integers into A32_UINT pixels.
 *
 * Only the alpha component (src[3]) is stored; negative values are raised
 * to 0 and the result is written as a native-endian 32-bit value.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_a32_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const int a = src[3];
         const uint32_t value = (uint32_t)(a > 0 ? a : 0);
         /* Byte-wise store: dst need not be 4-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack I32_UINT pixels into unsigned RGBA.
 *
 * Each pixel is one native-endian 32-bit unsigned value that is copied to
 * all four destination channels.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_i32_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t rgba;
         /* Byte-wise load: src need not be 4-byte aligned. */
         memcpy(&rgba, src, sizeof rgba);
         dst[0] = rgba; /* r */
         dst[1] = rgba; /* g */
         dst[2] = rgba; /* b */
         dst[3] = rgba; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA integers into I32_UINT pixels.
 *
 * The first component (src[0]) is written as a native-endian 32-bit
 * value; the other components are ignored.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_i32_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t value = src[0];
         /* Byte-wise store: dst need not be 4-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single I32_UINT pixel as unsigned RGBA.
 *
 * The native-endian 32-bit value at src is copied to all four channels of
 * dst.  The i and j arguments are not used.
 */
static inline void
util_format_i32_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t rgba;
   /* Byte-wise load: src need not be 4-byte aligned. */
   memcpy(&rgba, src, sizeof rgba);
   dst[0] = rgba; /* r */
   dst[1] = rgba; /* g */
   dst[2] = rgba; /* b */
   dst[3] = rgba; /* a */
}

/**
 * Unpack I32_UINT pixels into signed RGBA.
 *
 * Each pixel is one native-endian 32-bit unsigned value; it is saturated
 * to 2147483647 and copied to all four destination channels.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_i32_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t rgba;
         int clamped;
         /* Byte-wise load: src need not be 4-byte aligned. */
         memcpy(&rgba, src, sizeof rgba);
         clamped = (int)(rgba > 2147483647u ? 2147483647u : rgba);
         dst[0] = clamped; /* r */
         dst[1] = clamped; /* g */
         dst[2] = clamped; /* b */
         dst[3] = clamped; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA integers into I32_UINT pixels.
 *
 * The first component (src[0]) is stored with negative values raised to
 * 0, as a native-endian 32-bit value; the other components are ignored.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_i32_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const int v = src[0];
         const uint32_t value = (uint32_t)(v > 0 ? v : 0);
         /* Byte-wise store: dst need not be 4-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack L32_UINT pixels into unsigned RGBA.
 *
 * Each pixel is one native-endian 32-bit unsigned value that is copied to
 * r, g and b; alpha is set to 1.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l32_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t rgb;
         /* Byte-wise load: src need not be 4-byte aligned. */
         memcpy(&rgb, src, sizeof rgb);
         dst[0] = rgb; /* r */
         dst[1] = rgb; /* g */
         dst[2] = rgb; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack unsigned RGBA integers into L32_UINT pixels.
 *
 * The first component (src[0]) is written as a native-endian 32-bit
 * value; the other components are ignored.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l32_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const uint32_t value = src[0];
         /* Byte-wise store: dst need not be 4-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single L32_UINT pixel as unsigned RGBA.
 *
 * The native-endian 32-bit value at src is copied to r, g and b; alpha is
 * set to 1.  The i and j arguments are not used.
 */
static inline void
util_format_l32_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t rgb;
   /* Byte-wise load: src need not be 4-byte aligned. */
   memcpy(&rgb, src, sizeof rgb);
   dst[0] = rgb; /* r */
   dst[1] = rgb; /* g */
   dst[2] = rgb; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack L32_UINT pixels into signed RGBA.
 *
 * Each pixel is one native-endian 32-bit unsigned value; it is saturated
 * to 2147483647 and copied to r, g and b.  Alpha is set to 1.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l32_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         uint32_t rgb;
         int clamped;
         /* Byte-wise load: src need not be 4-byte aligned. */
         memcpy(&rgb, src, sizeof rgb);
         clamped = (int)(rgb > 2147483647u ? 2147483647u : rgb);
         dst[0] = clamped; /* r */
         dst[1] = clamped; /* g */
         dst[2] = clamped; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack signed RGBA integers into L32_UINT pixels.
 *
 * The first component (src[0]) is stored with negative values raised to
 * 0, as a native-endian 32-bit value; the other components are ignored.
 *
 * dst_stride and src_stride are byte distances between consecutive rows;
 * width and height are in pixels.
 */
static inline void
util_format_l32_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         const int v = src[0];
         const uint32_t value = (uint32_t)(v > 0 ? v : 0);
         /* Byte-wise store: dst need not be 4-byte aligned. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory layout of one L32A32_UINT texel.
 *
 * The member order is the same for both byte orders, so no
 * UTIL_ARCH_BIG_ENDIAN distinction is needed.
 */
struct util_format_l32a32_uint {
   uint32_t rgb; /* luminance, shared by r, g and b */
   uint32_t a;   /* alpha */
};

static inline void
util_format_l32a32_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.rgb; /* r */
         dst[1] = pixel.rgb; /* g */
         dst[2] = pixel.rgb; /* b */
         dst[3] = pixel.a; /* a */
#else
         struct util_format_l32a32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.rgb; /* r */
         dst[1] = pixel.rgb; /* g */
         dst[2] = pixel.rgb; /* b */
         dst[3] = pixel.a; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_l32a32_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_uint pixel;
         pixel.rgb = src[0];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_l32a32_uint pixel;
         pixel.rgb = src[0];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single L32A32_UINT texel as unsigned RGBA.
 *
 * @param dst  receives r, g, b (all equal to luminance) and a
 * @param src  points at the 8-byte texel
 * @param i    unused
 * @param j    unused
 */
static inline void
util_format_l32a32_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Same code for both byte orders: each field is a whole native word. */
   struct util_format_l32a32_uint texel;

   memcpy(&texel, src, sizeof texel);
   dst[3] = texel.a; /* a */
   dst[2] = texel.rgb; /* b */
   dst[1] = texel.rgb; /* g */
   dst[0] = texel.rgb; /* r */
}

static inline void
util_format_l32a32_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)MIN2(pixel.rgb, 2147483647); /* r */
         dst[1] = (int)MIN2(pixel.rgb, 2147483647); /* g */
         dst[2] = (int)MIN2(pixel.rgb, 2147483647); /* b */
         dst[3] = (int)MIN2(pixel.a, 2147483647); /* a */
#else
         struct util_format_l32a32_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)MIN2(pixel.rgb, 2147483647); /* r */
         dst[1] = (int)MIN2(pixel.rgb, 2147483647); /* g */
         dst[2] = (int)MIN2(pixel.rgb, 2147483647); /* b */
         dst[3] = (int)MIN2(pixel.a, 2147483647); /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_l32a32_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_uint pixel;
         pixel.rgb = (uint32_t)MAX2(src[0], 0);
         pixel.a = (uint32_t)MAX2(src[3], 0);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_l32a32_uint pixel;
         pixel.rgb = (uint32_t)MAX2(src[0], 0);
         pixel.a = (uint32_t)MAX2(src[3], 0);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of A32_SINT texels into signed RGBA.
 *
 * r, g and b are set to 0; alpha is the 32-bit signed texel value.
 *
 * @param dst_row     first destination row (4 ints per pixel)
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (4 bytes per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
static inline void
util_format_a32_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for (x = 0; x < width; x += 1) {
         uint32_t value;

         /* memcpy avoids an unaligned / type-punned 32-bit load from the
          * byte-typed source pointer. */
         memcpy(&value, src, sizeof value);

         dst[0] = 0; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = (int32_t)value; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of signed RGBA pixels into A32_SINT texels.
 *
 * Only the alpha component (src[3]) is stored; r, g and b are ignored.
 *
 * @param dst_row     first destination row (4 bytes per pixel)
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (4 ints per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
static inline void
util_format_a32_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         const uint32_t value = (uint32_t)(src[3]);

         /* memcpy avoids an unaligned / type-punned 32-bit store through the
          * byte-typed destination pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single A32_SINT texel as signed RGBA.
 *
 * r, g and b are set to 0; alpha is the 32-bit signed texel value.
 *
 * @param dst  receives four values in r, g, b, a order
 * @param src  points at the 4-byte texel; no particular alignment is needed
 * @param i    unused
 * @param j    unused
 */
static inline void
util_format_a32_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;

   /* Copy the bytes rather than dereferencing a casted pointer: src is a
    * byte pointer and is not guaranteed to be 32-bit aligned. */
   memcpy(&value, src, sizeof value);

   dst[0] = 0; /* r */
   dst[1] = 0; /* g */
   dst[2] = 0; /* b */
   dst[3] = (int32_t)value; /* a */
}

/**
 * Unpack a rectangle of A32_SINT texels into unsigned RGBA.
 *
 * r, g and b are set to 0; alpha is the signed texel value raised to at
 * least 0 with MAX2.
 *
 * @param dst_row     first destination row (4 unsigned values per pixel)
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (4 bytes per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
static inline void
util_format_a32_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for (x = 0; x < width; x += 1) {
         uint32_t value;
         int32_t a;

         /* memcpy avoids an unaligned / type-punned 32-bit load from the
          * byte-typed source pointer. */
         memcpy(&value, src, sizeof value);
         a = (int32_t)(value);

         dst[0] = 0; /* r */
         dst[1] = 0; /* g */
         dst[2] = 0; /* b */
         dst[3] = (unsigned)MAX2(a, 0); /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of unsigned RGBA pixels into A32_SINT texels.
 *
 * Only alpha (src[3]) is stored, limited to 2147483647 with MIN2 so that it
 * fits a signed 32-bit word.
 *
 * @param dst_row     first destination row (4 bytes per pixel)
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (4 unsigned values per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
static inline void
util_format_a32_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         const uint32_t value = (uint32_t)((int32_t)MIN2(src[3], 2147483647));

         /* memcpy avoids an unaligned / type-punned 32-bit store through the
          * byte-typed destination pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of I32_SINT texels into signed RGBA.
 *
 * The 32-bit signed intensity is replicated into r, g, b and a.
 *
 * @param dst_row     first destination row (4 ints per pixel)
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (4 bytes per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
static inline void
util_format_i32_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for (x = 0; x < width; x += 1) {
         uint32_t value;
         int32_t rgba;

         /* memcpy avoids an unaligned / type-punned 32-bit load from the
          * byte-typed source pointer. */
         memcpy(&value, src, sizeof value);
         rgba = (int32_t)(value);

         dst[0] = rgba; /* r */
         dst[1] = rgba; /* g */
         dst[2] = rgba; /* b */
         dst[3] = rgba; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of signed RGBA pixels into I32_SINT texels.
 *
 * Only the red component (src[0]) is stored; g, b and a are ignored.
 *
 * @param dst_row     first destination row (4 bytes per pixel)
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (4 ints per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
static inline void
util_format_i32_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         const uint32_t value = (uint32_t)(src[0]);

         /* memcpy avoids an unaligned / type-punned 32-bit store through the
          * byte-typed destination pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single I32_SINT texel as signed RGBA.
 *
 * The 32-bit signed intensity is replicated into r, g, b and a.
 *
 * @param dst  receives four values in r, g, b, a order
 * @param src  points at the 4-byte texel; no particular alignment is needed
 * @param i    unused
 * @param j    unused
 */
static inline void
util_format_i32_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   int32_t rgba;

   /* Copy the bytes rather than dereferencing a casted pointer: src is a
    * byte pointer and is not guaranteed to be 32-bit aligned. */
   memcpy(&value, src, sizeof value);
   rgba = (int32_t)(value);

   dst[0] = rgba; /* r */
   dst[1] = rgba; /* g */
   dst[2] = rgba; /* b */
   dst[3] = rgba; /* a */
}

/**
 * Unpack a rectangle of I32_SINT texels into unsigned RGBA.
 *
 * The signed intensity is raised to at least 0 with MAX2 and replicated into
 * r, g, b and a.
 *
 * @param dst_row     first destination row (4 unsigned values per pixel)
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (4 bytes per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
static inline void
util_format_i32_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for (x = 0; x < width; x += 1) {
         uint32_t value;
         int32_t rgba;
         unsigned clamped;

         /* memcpy avoids an unaligned / type-punned 32-bit load from the
          * byte-typed source pointer. */
         memcpy(&value, src, sizeof value);
         rgba = (int32_t)(value);
         clamped = (unsigned)MAX2(rgba, 0);

         dst[0] = clamped; /* r */
         dst[1] = clamped; /* g */
         dst[2] = clamped; /* b */
         dst[3] = clamped; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of unsigned RGBA pixels into I32_SINT texels.
 *
 * Only red (src[0]) is stored, limited to 2147483647 with MIN2 so that it
 * fits a signed 32-bit word.
 *
 * @param dst_row     first destination row (4 bytes per pixel)
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (4 unsigned values per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
static inline void
util_format_i32_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         const uint32_t value = (uint32_t)((int32_t)MIN2(src[0], 2147483647));

         /* memcpy avoids an unaligned / type-punned 32-bit store through the
          * byte-typed destination pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of L32_SINT texels into signed RGBA.
 *
 * The 32-bit signed luminance is replicated into r, g and b; alpha is 1.
 *
 * @param dst_row     first destination row (4 ints per pixel)
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (4 bytes per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
static inline void
util_format_l32_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for (x = 0; x < width; x += 1) {
         uint32_t value;
         int32_t rgb;

         /* memcpy avoids an unaligned / type-punned 32-bit load from the
          * byte-typed source pointer. */
         memcpy(&value, src, sizeof value);
         rgb = (int32_t)(value);

         dst[0] = rgb; /* r */
         dst[1] = rgb; /* g */
         dst[2] = rgb; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of signed RGBA pixels into L32_SINT texels.
 *
 * Only the red component (src[0]) is stored; g, b and a are ignored.
 *
 * @param dst_row     first destination row (4 bytes per pixel)
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (4 ints per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
static inline void
util_format_l32_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         const uint32_t value = (uint32_t)(src[0]);

         /* memcpy avoids an unaligned / type-punned 32-bit store through the
          * byte-typed destination pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single L32_SINT texel as signed RGBA.
 *
 * The 32-bit signed luminance is replicated into r, g and b; alpha is 1.
 *
 * @param dst  receives four values in r, g, b, a order
 * @param src  points at the 4-byte texel; no particular alignment is needed
 * @param i    unused
 * @param j    unused
 */
static inline void
util_format_l32_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   int32_t rgb;

   /* Copy the bytes rather than dereferencing a casted pointer: src is a
    * byte pointer and is not guaranteed to be 32-bit aligned. */
   memcpy(&value, src, sizeof value);
   rgb = (int32_t)(value);

   dst[0] = rgb; /* r */
   dst[1] = rgb; /* g */
   dst[2] = rgb; /* b */
   dst[3] = 1; /* a */
}

/**
 * Unpack a rectangle of L32_SINT texels into unsigned RGBA.
 *
 * The signed luminance is raised to at least 0 with MAX2 and replicated into
 * r, g and b; alpha is 1.
 *
 * @param dst_row     first destination row (4 unsigned values per pixel)
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (4 bytes per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
static inline void
util_format_l32_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for (x = 0; x < width; x += 1) {
         uint32_t value;
         int32_t rgb;
         unsigned clamped;

         /* memcpy avoids an unaligned / type-punned 32-bit load from the
          * byte-typed source pointer. */
         memcpy(&value, src, sizeof value);
         rgb = (int32_t)(value);
         clamped = (unsigned)MAX2(rgb, 0);

         dst[0] = clamped; /* r */
         dst[1] = clamped; /* g */
         dst[2] = clamped; /* b */
         dst[3] = 1; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of unsigned RGBA pixels into L32_SINT texels.
 *
 * Only red (src[0]) is stored, limited to 2147483647 with MIN2 so that it
 * fits a signed 32-bit word.
 *
 * @param dst_row     first destination row (4 bytes per pixel)
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (4 unsigned values per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
static inline void
util_format_l32_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         const uint32_t value = (uint32_t)((int32_t)MIN2(src[0], 2147483647));

         /* memcpy avoids an unaligned / type-punned 32-bit store through the
          * byte-typed destination pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory layout of one L32A32_SINT texel.
 *
 * The member order is the same for both byte orders, so no
 * UTIL_ARCH_BIG_ENDIAN distinction is needed.
 */
struct util_format_l32a32_sint {
   int32_t rgb; /* luminance, shared by r, g and b */
   int32_t a;   /* alpha */
};

static inline void
util_format_l32a32_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.rgb; /* r */
         dst[1] = pixel.rgb; /* g */
         dst[2] = pixel.rgb; /* b */
         dst[3] = pixel.a; /* a */
#else
         struct util_format_l32a32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = pixel.rgb; /* r */
         dst[1] = pixel.rgb; /* g */
         dst[2] = pixel.rgb; /* b */
         dst[3] = pixel.a; /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_l32a32_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_sint pixel;
         pixel.rgb = src[0];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_l32a32_sint pixel;
         pixel.rgb = src[0];
         pixel.a = src[3];
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single L32A32_SINT texel as signed RGBA.
 *
 * @param dst  receives r, g, b (all equal to luminance) and a
 * @param src  points at the 8-byte texel
 * @param i    unused
 * @param j    unused
 */
static inline void
util_format_l32a32_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Same code for both byte orders: each field is a whole native word. */
   struct util_format_l32a32_sint texel;

   memcpy(&texel, src, sizeof texel);
   dst[3] = texel.a; /* a */
   dst[2] = texel.rgb; /* b */
   dst[1] = texel.rgb; /* g */
   dst[0] = texel.rgb; /* r */
}

static inline void
util_format_l32a32_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MAX2(pixel.rgb, 0); /* r */
         dst[1] = (unsigned)MAX2(pixel.rgb, 0); /* g */
         dst[2] = (unsigned)MAX2(pixel.rgb, 0); /* b */
         dst[3] = (unsigned)MAX2(pixel.a, 0); /* a */
#else
         struct util_format_l32a32_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MAX2(pixel.rgb, 0); /* r */
         dst[1] = (unsigned)MAX2(pixel.rgb, 0); /* g */
         dst[2] = (unsigned)MAX2(pixel.rgb, 0); /* b */
         dst[3] = (unsigned)MAX2(pixel.a, 0); /* a */
#endif
         src += 8;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_l32a32_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_l32a32_sint pixel;
         pixel.rgb = (int32_t)MIN2(src[0], 2147483647);
         pixel.a = (int32_t)MIN2(src[3], 2147483647);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_l32a32_sint pixel;
         pixel.rgb = (int32_t)MIN2(src[0], 2147483647);
         pixel.a = (int32_t)MIN2(src[3], 2147483647);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 8;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory layout of one B8G8R8_UINT texel: three bytes in b, g, r order.
 *
 * The member order is the same for both byte orders, so no
 * UTIL_ARCH_BIG_ENDIAN distinction is needed.
 */
struct util_format_b8g8r8_uint {
   uint8_t b; /* blue,  byte 0 */
   uint8_t g; /* green, byte 1 */
   uint8_t r; /* red,   byte 2 */
};

static inline void
util_format_b8g8r8_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)pixel.r; /* r */
         dst[1] = (unsigned)pixel.g; /* g */
         dst[2] = (unsigned)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_b8g8r8_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)pixel.r; /* r */
         dst[1] = (unsigned)pixel.g; /* g */
         dst[2] = (unsigned)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_b8g8r8_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_uint pixel;
         pixel.b = (uint8_t)MIN2(src[2], 255);
         pixel.g = (uint8_t)MIN2(src[1], 255);
         pixel.r = (uint8_t)MIN2(src[0], 255);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_uint pixel;
         pixel.b = (uint8_t)MIN2(src[2], 255);
         pixel.g = (uint8_t)MIN2(src[1], 255);
         pixel.r = (uint8_t)MIN2(src[0], 255);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B8G8R8_UINT texel as unsigned RGBA.
 *
 * @param dst  receives r, g, b widened from bytes, and a = 1
 * @param src  points at the 3-byte texel
 * @param i    unused
 * @param j    unused
 */
static inline void
util_format_b8g8r8_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Byte-sized fields: the same code serves both byte orders. */
   struct util_format_b8g8r8_uint texel;

   memcpy(&texel, src, sizeof texel);
   dst[3] = 1; /* a */
   dst[2] = (unsigned)texel.b; /* b */
   dst[1] = (unsigned)texel.g; /* g */
   dst[0] = (unsigned)texel.r; /* r */
}

static inline void
util_format_b8g8r8_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_b8g8r8_uint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_b8g8r8_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_uint pixel;
         pixel.b = (uint8_t)CLAMP(src[2], 0, 255);
         pixel.g = (uint8_t)CLAMP(src[1], 0, 255);
         pixel.r = (uint8_t)CLAMP(src[0], 0, 255);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_uint pixel;
         pixel.b = (uint8_t)CLAMP(src[2], 0, 255);
         pixel.g = (uint8_t)CLAMP(src[1], 0, 255);
         pixel.r = (uint8_t)CLAMP(src[0], 0, 255);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of B8G8R8A8_UINT texels into unsigned RGBA.
 *
 * Each texel is read as one native 32-bit word and split into four 8-bit
 * channels; which bits hold which channel depends on UTIL_ARCH_BIG_ENDIAN.
 *
 * @param dst_row     first destination row (4 unsigned values per pixel)
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (4 bytes per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
static inline void
util_format_b8g8r8a8_uint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for (x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t b;
         uint32_t g;
         uint32_t r;
         uint32_t a;

         /* memcpy avoids an unaligned / type-punned 32-bit load from the
          * byte-typed source pointer. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         b = value >> 24;
         g = (value >> 16) & 0xff;
         r = (value >> 8) & 0xff;
         a = (value) & 0xff;
#else
         b = (value) & 0xff;
         g = (value >> 8) & 0xff;
         r = (value >> 16) & 0xff;
         a = value >> 24;
#endif
         dst[0] = (unsigned)r; /* r */
         dst[1] = (unsigned)g; /* g */
         dst[2] = (unsigned)b; /* b */
         dst[3] = (unsigned)a; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of unsigned RGBA pixels into B8G8R8A8_UINT texels.
 *
 * Each channel is limited to 255 with MIN2, then the four bytes are combined
 * into one native 32-bit word; the bit position of each channel depends on
 * UTIL_ARCH_BIG_ENDIAN.
 *
 * @param dst_row     first destination row (4 bytes per pixel)
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (4 unsigned values per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
static inline void
util_format_b8g8r8a8_uint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         const uint32_t b = (uint8_t)MIN2(src[2], 255);
         const uint32_t g = (uint8_t)MIN2(src[1], 255);
         const uint32_t r = (uint8_t)MIN2(src[0], 255);
         const uint32_t a = (uint8_t)MIN2(src[3], 255);
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (b << 24) | (g << 16) | (r << 8) | a;
#else
         value = b | (g << 8) | (r << 16) | (a << 24);
#endif
         /* memcpy avoids an unaligned / type-punned 32-bit store through the
          * byte-typed destination pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B8G8R8A8_UINT texel as unsigned RGBA.
 *
 * The texel is read as one native 32-bit word and split into four 8-bit
 * channels; which bits hold which channel depends on UTIL_ARCH_BIG_ENDIAN.
 *
 * @param dst  receives four values in r, g, b, a order
 * @param src  points at the 4-byte texel; no particular alignment is needed
 * @param i    unused
 * @param j    unused
 */
static inline void
util_format_b8g8r8a8_uint_fetch_unsigned(unsigned *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   uint32_t value;
   uint32_t b;
   uint32_t g;
   uint32_t r;
   uint32_t a;

   /* Copy the bytes rather than dereferencing a casted pointer: src is a
    * byte pointer and is not guaranteed to be 32-bit aligned. */
   memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
   b = value >> 24;
   g = (value >> 16) & 0xff;
   r = (value >> 8) & 0xff;
   a = (value) & 0xff;
#else
   b = (value) & 0xff;
   g = (value >> 8) & 0xff;
   r = (value >> 16) & 0xff;
   a = value >> 24;
#endif
   dst[0] = (unsigned)r; /* r */
   dst[1] = (unsigned)g; /* g */
   dst[2] = (unsigned)b; /* b */
   dst[3] = (unsigned)a; /* a */
}

/**
 * Unpack a rectangle of B8G8R8A8_UINT texels into signed RGBA.
 *
 * Each texel is read as one native 32-bit word and split into four 8-bit
 * channels, which always fit an int; which bits hold which channel depends
 * on UTIL_ARCH_BIG_ENDIAN.
 *
 * @param dst_row     first destination row (4 ints per pixel)
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (4 bytes per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
static inline void
util_format_b8g8r8a8_uint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for (x = 0; x < width; x += 1) {
         uint32_t value;
         uint32_t b;
         uint32_t g;
         uint32_t r;
         uint32_t a;

         /* memcpy avoids an unaligned / type-punned 32-bit load from the
          * byte-typed source pointer. */
         memcpy(&value, src, sizeof value);
#if UTIL_ARCH_BIG_ENDIAN
         b = value >> 24;
         g = (value >> 16) & 0xff;
         r = (value >> 8) & 0xff;
         a = (value) & 0xff;
#else
         b = (value) & 0xff;
         g = (value >> 8) & 0xff;
         r = (value >> 16) & 0xff;
         a = value >> 24;
#endif
         dst[0] = (int)r; /* r */
         dst[1] = (int)g; /* g */
         dst[2] = (int)b; /* b */
         dst[3] = (int)a; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack a rectangle of signed RGBA pixels into B8G8R8A8_UINT texels.
 *
 * Each channel is clamped to [0, 255] with CLAMP, then the four bytes are
 * combined into one native 32-bit word; the bit position of each channel
 * depends on UTIL_ARCH_BIG_ENDIAN.
 *
 * @param dst_row     first destination row (4 bytes per pixel)
 * @param dst_stride  destination row pitch in bytes
 * @param src_row     first source row (4 ints per pixel)
 * @param src_stride  source row pitch in bytes
 * @param width       pixels per row
 * @param height      number of rows
 */
static inline void
util_format_b8g8r8a8_uint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for (y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for (x = 0; x < width; x += 1) {
         const uint32_t b = (uint8_t)CLAMP(src[2], 0, 255);
         const uint32_t g = (uint8_t)CLAMP(src[1], 0, 255);
         const uint32_t r = (uint8_t)CLAMP(src[0], 0, 255);
         const uint32_t a = (uint8_t)CLAMP(src[3], 0, 255);
         uint32_t value;
#if UTIL_ARCH_BIG_ENDIAN
         value = (b << 24) | (g << 16) | (r << 8) | a;
#else
         value = b | (g << 8) | (r << 16) | (a << 24);
#endif
         /* memcpy avoids an unaligned / type-punned 32-bit store through the
          * byte-typed destination pointer. */
         memcpy(dst, &value, sizeof value);
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Memory layout of one B8G8R8_SINT texel: three signed bytes in b, g, r order.
 *
 * The member order is the same for both byte orders, so no
 * UTIL_ARCH_BIG_ENDIAN distinction is needed.
 */
struct util_format_b8g8r8_sint {
   int8_t b; /* blue,  byte 0 */
   int8_t g; /* green, byte 1 */
   int8_t r; /* red,   byte 2 */
};

static inline void
util_format_b8g8r8_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_b8g8r8_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (int)pixel.r; /* r */
         dst[1] = (int)pixel.g; /* g */
         dst[2] = (int)pixel.b; /* b */
         dst[3] = 1; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_b8g8r8_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_sint pixel;
         pixel.b = (int8_t)CLAMP(src[2], -128, 127);
         pixel.g = (int8_t)CLAMP(src[1], -128, 127);
         pixel.r = (int8_t)CLAMP(src[0], -128, 127);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_sint pixel;
         pixel.b = (int8_t)CLAMP(src[2], -128, 127);
         pixel.g = (int8_t)CLAMP(src[1], -128, 127);
         pixel.r = (int8_t)CLAMP(src[0], -128, 127);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch a single B8G8R8_SINT texel as signed RGBA.
 *
 * @param dst  receives r, g, b widened from signed bytes, and a = 1
 * @param src  points at the 3-byte texel
 * @param i    unused
 * @param j    unused
 */
static inline void
util_format_b8g8r8_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Byte-sized fields: the same code serves both byte orders. */
   struct util_format_b8g8r8_sint texel;

   memcpy(&texel, src, sizeof texel);
   dst[3] = 1; /* a */
   dst[2] = (int)texel.b; /* b */
   dst[1] = (int)texel.g; /* g */
   dst[0] = (int)texel.r; /* r */
}

static inline void
util_format_b8g8r8_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MAX2(pixel.r, 0); /* r */
         dst[1] = (unsigned)MAX2(pixel.g, 0); /* g */
         dst[2] = (unsigned)MAX2(pixel.b, 0); /* b */
         dst[3] = 1; /* a */
#else
         struct util_format_b8g8r8_sint pixel;
         memcpy(&pixel, src, sizeof pixel);
         dst[0] = (unsigned)MAX2(pixel.r, 0); /* r */
         dst[1] = (unsigned)MAX2(pixel.g, 0); /* g */
         dst[2] = (unsigned)MAX2(pixel.b, 0); /* b */
         dst[3] = 1; /* a */
#endif
         src += 3;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

static inline void
util_format_b8g8r8_sint_pack_unsigned(uint8_t *dst_row, unsigned dst_stride, const unsigned *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const unsigned *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         struct util_format_b8g8r8_sint pixel;
         pixel.b = (int8_t)MIN2(src[2], 127);
         pixel.g = (int8_t)MIN2(src[1], 127);
         pixel.r = (int8_t)MIN2(src[0], 127);
         memcpy(dst, &pixel, sizeof pixel);
#else
         struct util_format_b8g8r8_sint pixel;
         pixel.b = (int8_t)MIN2(src[2], 127);
         pixel.g = (int8_t)MIN2(src[1], 127);
         pixel.r = (int8_t)MIN2(src[0], 127);
         memcpy(dst, &pixel, sizeof pixel);
#endif
         src += 4;
         dst += 3;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Unpack a rectangle of B8G8R8A8_SINT pixels into signed-integer RGBA.
 *
 * A source pixel is 4 bytes laid out in memory as b, g, r, a; each byte is
 * a signed 8-bit channel that is sign-extended to int and written to the
 * destination in r, g, b, a order.
 *
 * The channels are read one byte at a time rather than through a single
 * uint32_t load.  src is only a byte pointer and src_stride is an arbitrary
 * byte count, so a 32-bit load could be misaligned and would access the
 * buffer through an incompatible type.  Byte reads yield the same values on
 * big- and little-endian hosts, so no UTIL_ARCH_BIG_ENDIAN split is needed.
 *
 * \param dst_row     first destination row (4 ints per pixel)
 * \param dst_stride  destination row pitch in bytes (converted to elements)
 * \param src_row     first source row (4 bytes per pixel)
 * \param src_stride  source row pitch in bytes
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_b8g8r8a8_sint_unpack_signed(int *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      int *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
         /* Read the whole pixel before writing anything. */
         const int8_t b = (int8_t)src[0];
         const int8_t g = (int8_t)src[1];
         const int8_t r = (int8_t)src[2];
         const int8_t a = (int8_t)src[3];
         dst[0] = (int)r; /* r */
         dst[1] = (int)g; /* g */
         dst[2] = (int)b; /* b */
         dst[3] = (int)a; /* a */
         src += 4;
         dst += 4;
      }
      src_row += src_stride;
      dst_row += dst_stride/sizeof(*dst_row);
   }
}

/**
 * Pack signed-integer RGBA values into B8G8R8A8_SINT pixels.
 *
 * Each source pixel is four ints (r, g, b, a at indices 0..3).  Every
 * channel is clamped to [-128, 127], narrowed to int8_t and stored as one
 * byte; the destination pixel is 4 bytes laid out in memory as b, g, r, a.
 *
 * The bytes are stored individually rather than through a single uint32_t
 * store.  dst is only a byte pointer and dst_stride is an arbitrary byte
 * count, so a 32-bit store could be misaligned and would access the buffer
 * through an incompatible type.  Byte stores produce the same memory
 * contents on big- and little-endian hosts, so no UTIL_ARCH_BIG_ENDIAN
 * split is needed.
 *
 * \param dst_row     first destination row (4 bytes per pixel)
 * \param dst_stride  destination row pitch in bytes
 * \param src_row     first source row (4 ints per pixel)
 * \param src_stride  source row pitch in bytes (converted to elements)
 * \param width       pixels per row
 * \param height      number of rows
 */
static inline void
util_format_b8g8r8a8_sint_pack_signed(uint8_t *dst_row, unsigned dst_stride, const int *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      const int *src = src_row;
      uint8_t *dst = dst_row;
      for(x = 0; x < width; x += 1) {
         /* Compute the whole pixel before writing anything. */
         const int8_t b = (int8_t)CLAMP(src[2], -128, 127);
         const int8_t g = (int8_t)CLAMP(src[1], -128, 127);
         const int8_t r = (int8_t)CLAMP(src[0], -128, 127);
         const int8_t a = (int8_t)CLAMP(src[3], -128, 127);
         dst[0] = (uint8_t)b; /* b */
         dst[1] = (uint8_t)g; /* g */
         dst[2] = (uint8_t)r; /* r */
         dst[3] = (uint8_t)a; /* a */
         src += 4;
         dst += 4;
      }
      dst_row += dst_stride;
      src_row += src_stride/sizeof(*src_row);
   }
}

/**
 * Fetch one B8G8R8A8_SINT pixel as signed-integer RGBA.
 *
 * The pixel is 4 bytes laid out in memory as b, g, r, a; each byte is a
 * signed 8-bit channel that is sign-extended to int.  The i and j
 * arguments are not used.
 *
 * The channels are read one byte at a time rather than through a single
 * uint32_t load: src is only a byte pointer, so a 32-bit load could be
 * misaligned and would access the buffer through an incompatible type.
 * Byte reads yield the same values on big- and little-endian hosts, so no
 * UTIL_ARCH_BIG_ENDIAN split is needed.
 *
 * \param dst  receives four ints in r, g, b, a order
 * \param src  points at the packed pixel
 */
static inline void
util_format_b8g8r8a8_sint_fetch_signed(int *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)
{
   /* Read the whole pixel before writing anything. */
   const int8_t b = (int8_t)src[0];
   const int8_t g = (int8_t)src[1];
   const int8_t r = (int8_t)src[2];
   const int8_t a = (int8_t)src[3];

   dst[0] = (int)r; /* r */
   dst[1] = (int)g; /* g */
   dst[2] = (int)b; /* b */
   dst[3] = (int)a; /* a */
}

static inline void
util_format_b8g8r8a8_sint_unpack_unsigned(unsigned *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
{
   unsigned x, y;
   for(y = 0; y < height; y += 1) {
      unsigned *dst = dst_row;
      const uint8_t *src = src_row;
      for(x = 0; x < width; x += 1) {
#if UTIL_ARCH_BIG_ENDIAN
         uint32_t value = *(const uint32_t *)src;
         int32_t b;
         int32_t g;
         int32_t r;
         int32_t a;
         b = ((int32_t)(value) ) >> 24;
         g = ((int32_t)(value << 8) ) >> 24;
         r = ((int32_t)(value << 16) ) >> 24;
         a = ((int32_t)(value << 24) ) >> 24;
         dst[0] = (unsigned)MAX2(r, 0); /* r */
         dst[1] = (unsigned)MAX2(g, 0); /* g */
         dst[2] = (unsigned)MAX2(b, 0); /* b */
         dst[3] = (unsigned)MAX2(a, 0); /* a */
#else
         uint32_t value = *(const uint32_t *)src;
         int32_t b;
         int32_t g;
         int32_t r;
         int32_t a;
         b = ((int32_t)(value << 24) ) >> 24;
         g = ((int32_t)(value << 16) ) >> 24;
         r = ((int32_t)(value << 8) ) >> 24;
         a = ((int32_t)(value) ) >> 24;
         dst[0] = (unsigned)