/*
* ffmpeg/libavcodec/parisc/dsputil_parisc.c
*
* DSP utils optimizations for PA-RISC 2.0 using the MAX instruction set
* Copyright (c) 2006 Thibaut VARENE <T-Bone@parisc-linux.org>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License Version 2.1 as published by the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* Alternatively, the contents of this file may be used under the
* terms of the GNU General Public License Version 2 (the "GPL"),
* as published by the Free Software Foundation, in which case the
* provisions of the GPL are applicable instead of those above.
*/
/* gcc -O2 -mpa-risc-2-0 -c dsputil_parisc.c */
/*
<tausq> r26 is your first argument
<tausq> for the other ones, if you want to use scratch registers in your asm, start with the argument registers (offset by how many real arguments you have) and work backwards. so if you have two arguments (r26, r25) then start with r24
<tausq> r19-r26 are call clobbered registers, so gcc doesn't have to save them to the stack first. if you use r3, ... gcc will spill them to the stack
<tausq> r19 will be saved if you are compiling PIC tho
<tausq> and if you will set the return value later, you can use r28 and r29
<tausq> or if your function is void ...
<tausq> so use them in that order.... r26-r20, r1, r28-r29
<tausq> i assume you mean something like ldw 4(r20),r20 ?
<tausq> that will stall, you should try to rotate your registers
*/
/* TODO: - optimize away loops (unroll...)?
- deal with unaligned cases (STBY, STDBY)
- cache hints?
*/
/* XXX Review the use of __asm__ __volatile__ vs asm volatile */
/* Userland is 32bit: *BE REALLY CAREFUL* when using 64bit regs and accessing memory.
Note: this code can only be used when running a wide (64bit) kernel, which preserves the upper halves of the 64bit registers across context switches */
#warning "must deal with alignments"
#undef DEBUG
#ifdef DEBUG
#define DASSERT(x) do { if (!(x)) av_log(NULL, AV_LOG_ERROR, "assertion failed %s line %d\n", #x, __LINE__); } while (0)
#else
#define DASSERT(x) do { } while (0)
#endif
#undef USE_C
#if 0
#include <inttypes.h>
#define DCTELEM short
#define restrict __restrict
#else
#include "../dsputil.h"
#endif
/* binutils doesn't know (yet?) about these, hence the explicit extrd,u / depd,z forms */
#define SHRDU(r,sa,t) "extrd,u " #r ",63-" #sa ",64-" #sa "," #t "\n\t"
#define SHLD(r,sa,t) "depd,z " #r ",63-" #sa ",64-" #sa "," #t "\n\t"
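/* e.g. SHRDU(%%r24,48,%%r20) emits "extrd,u %r24,63-48,64-48,%r20" (the assembler folds
the arithmetic), i.e. a logical right shift of %r24 by 48 into %r20; SHLD is the matching
depd,z left shift with zero fill. */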
/* mpegvideo.c and h264.c are the users. They need prefetch read */
void prefetch_pa(void *mem, int stride, int h)
{
#if 0 /* x86 equivalent, lifted from the x86 prefetch macro: #op stands for the prefetch opcode */
const uint8_t *p= mem;
do{
asm volatile(#op" %0" :: "m"(*p));
p+= stride;
}while(--h);
#endif
DASSERT((stride % 8) == 0);
do {
asm volatile(
"ldd,m %1(%0),%%r0\n\t" /* prefetch read. ldw would prefetch write */
: "+r" (mem)
: "r" (stride)
: "memory");
} while (--h);
}
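/* For reference, a portable sketch of the same loop using GCC's __builtin_prefetch
   (read prefetch, low temporal locality). Kept under #if 0 as an assumption/reference
   only; the ldd-to-%r0 form above is what this file actually uses. */
#if 0
static void prefetch_c(void *mem, int stride, int h)
{
    const uint8_t *p = mem;
    do {
        __builtin_prefetch(p, 0, 0); /* rw=0 (read), locality=0 */
        p += stride;
    } while (--h);
}
#endif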
void clear_blocks_pa(DCTELEM *blocks)
{
uint64_t *p = (uint64_t *) blocks;
int n = sizeof(DCTELEM) * 6 * 64;
#ifdef USE_C /* C equiv */
do {
p[0] = 0;
p[1] = 0;
p[2] = 0;
p[3] = 0;
p[4] = 0;
p[5] = 0;
p[6] = 0;
p[7] = 0;
p += 8;
n -= 8 * 8;
} while (n);
#else
do {
__asm__ __volatile__(
"std %%r0,0(%0)\n\t"
"std %%r0,8(%0)\n\t"
"std %%r0,16(%0)\n\t"
"std %%r0,24(%0)\n\t"
"std %%r0,32(%0)\n\t"
"std %%r0,40(%0)\n\t"
"std %%r0,48(%0)\n\t"
"std %%r0,56(%0)\n\t"
:
: "r" (p)
: "memory");
p += 8; /* given the compiler output, I'm not sure this can be done better */
n -= 8 * 8;
} while (n);
#endif
}
/* averaging together each (x,y) byte couple */
/* this function uses unbiased rounding (ties round to odd), so there is no need to
differentiate rnd/no_rnd on parisc. e.g.:
aa: 2, 3, 5, 9, 128, 190, 232, 254
bb: 0, 0, 0, 0, 1, 1, 1, 1
no rnd: 1, 1, 2, 4, 64, 95, 116, 127
rnd: 1, 2, 3, 5, 65, 96, 117, 128
avg(aa,bb): 1, 1, 3, 5, 65, 95, 117, 127
*/
/* TESTED OK */
void avg8x8(uint8_t *x, const uint8_t *y)
{
/* assert x[8] and y[8] - r26, r25 are params */
__asm__ __volatile__(
"ldd 0(%0),%%r24\n\t" /* x0x1x2x3x4x5x6x7 */
"hshl %%r24,8,%%r22\n\t" /* x100x300x500x700 */
"hshr,u %%r22,8,%%r21\n\t" /* 00x100x300x500x7 */
"hshr,u %%r24,8,%%r22\n\t" /* 00x000x200x400x6 */
"ldd 0(%1),%%r23\n\t" /* y0y1y2y3y4y5y6y7 */
"hshl %%r23,8,%%r20\n\t" /* y100y300y500y700 */
"hshr,u %%r20,8,%%r24\n\t" /* 00y100y300y500y7 */
"hshr,u %%r23,8,%%r20\n\t" /* 00y000y200y400y6 */
"havg %%r21,%%r24,%%r23\n\t" /* 00z100z300z500z7 - z=avg(x,y) unbiased rounding */
"havg %%r22,%%r20,%%r24\n\t" /* 00z000z200z400z6 - z=avg(x,y) unbiased rounding */
"hshl %%r24,8,%%r21\n\t" /* z000z200z400z600 */
"or %%r23,%%r21,%%r20\n\t" /* z0z1z2z3z4z5z6z7 */
"std %%r20,0(%0)\n\t" /* write back */
:
: "r" (x), "r" (y)
: "%r24", "%r23", "%r22", "%r21", "%r20", "memory");
}
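/* Scalar sketch of avg8x8, matching the rounding shown in the table above: havg
   appears to compute (x+y)>>1 with the shifted-out bit ORed back into the LSB
   ("round to odd"), which is what makes the rounding unbiased. Reference only,
   kept under #if 0. */
#if 0
static void avg8x8_c(uint8_t *x, const uint8_t *y)
{
    int i, s;
    for (i = 0; i < 8; i++) {
        s = x[i] + y[i];
        x[i] = (uint8_t)((s >> 1) | (s & 1)); /* average, ties rounded to odd */
    }
}
#endif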
#if 0 /* this can't work efficiently because of the 64bit retval */
uint64_t avg8x8r(uint8_t *x, const uint8_t *y)
{
uint64_t s __attribute__((aligned(8)));
/* assert x[8] and y[8] - r26, r25 are params */
__asm__ __volatile__(
"ldd 0(%1),%%r24\n\t" /* x0x1x2x3x4x5x6x7 */
"hshl %%r24,8,%%r22\n\t" /* x100x300x500x700 */
"hshr,u %%r22,8,%%r21\n\t" /* 00x100x300x500x7 */
"hshr,u %%r24,8,%%r22\n\t" /* 00x000x200x400x6 */
"ldd 0(%2),%%r23\n\t" /* y0y1y2y3y4y5y6y7 */
"hshl %%r23,8,%%r20\n\t" /* y100y300y500y700 */
"hshr,u %%r20,8,%%r24\n\t" /* 00y100y300y500y7 */
"hshr,u %%r23,8,%%r20\n\t" /* 00y000y200y400y6 */
"havg %%r21,%%r24,%%r23\n\t" /* 00z100z300z500z7 - z=avg(x,y) unbiased rounding */
"havg %%r22,%%r20,%%r24\n\t" /* 00z000z200z400z6 - z=avg(x,y) unbiased rounding */
"hshl %%r24,8,%%r21\n\t" /* z000z200z400z600 */
"or %%r23,%%r21,%0\n\t" /* z0z1z2z3z4z5z6z7 written */
: "=r" (s)
: "r" (x), "r" (y)
: "%r24", "%r23", "%r22", "%r21", "%r20", "memory");
return s;
}
#endif
/* adding together each (x,y) byte couple - unsigned saturation */
/* MAX implements a brain-damaged "unsigned_sat_add(unsigned short, signed short)" */
/* TESTED OK */
void add8x8us(uint8_t *x, const uint8_t *y)
{
#ifdef USE_C /* C equiv */
int i, s;
for (i=0; i<8; i++) {
s = x[i] + y[i];
x[i] = (s>255) ? 255 : (uint8_t)s;
}
#else
/* assert x[8] and y[8] */
__asm__ __volatile__( /* r26, r25, r24x0, r22x1 */
"uaddcm %%r0,%%r0,%%r24\n\t" /* FFFFFFFFFFFFFFFF */
"hshl %%r24,1,%%r1\n\t" /* FFFEFFFEFFFEFFFE */
"ldd 0(%0),%%r24\n\t" /* x0x1x2x3x4x5x6x7 */
"hshl %%r24,8,%%r22\n\t" /* x100x300x500x700 r22 */
"hshr,u %%r24,8,%%r21\n\t" /* 00x000x200x400x6 */
"hshl %%r21,8,%%r24\n\t" /* x000x200x400x600 r24 */
"ldd 0(%1),%%r23\n\t" /* y0y1y2y3y4y5y6y7 r23 */
"hshl %%r23,8,%%r20\n\t" /* y100y300y500y700 */
"hshr,u %%r20,8,%%r21\n\t" /* 00y100y300y500y7 */
"andcm %%r21,%%r1,%%r20\n\t" /* save lsb */
"hshl %%r20,8,%%r28\n\t" /* shift back lsb r28 */
"hshr,u %%r21,1,%%r20\n\t" /* 00y100y300y500y7/2 */
"hshl %%r20,8,%%r21\n\t" /* y100y300y500y700/2 r21 */
"hadd,us %%r22,%%r21,%%r20\n\t" /* odd x+(y/2) !r22 */
"hadd,us %%r20,%%r21,%%r22\n\t" /* odd x+(2*(y/2)) !r21 */
"hadd,us %%r22,%%r28,%%r21\n\t" /* odd x+(2*(y/2))+lsb r21,!r28 */
"hshr,u %%r23,8,%%r22\n\t" /* 00y000y200y400y6 !r23 */
"andcm %%r22,%%r1,%%r20\n\t" /* save lsb */
"hshl %%r20,8,%%r28\n\t" /* shift back lsb r28 */
"hshr,u %%r22,1,%%r20\n\t" /* divide by 2 */
"hshl %%r20,8,%%r23\n\t" /* y000y200y400y600/2 */
"hadd,us %%r24,%%r23,%%r22\n\t" /* even x+(y/2) !r24*/
"hadd,us %%r22,%%r23,%%r24\n\t" /* even x+(2*(y/2)) */
"hadd,us %%r24,%%r28,%%r22\n\t" /* even x+(2*(y/2))+lsb r22,!r28 */
"hshr,u %%r21,8,%%r20\n\t" /* 00z100z300z500z7 */
"hshr,u %%r22,8,%%r23\n\t" /* 00z000z200z400z6 */
"hshl %%r23,8,%%r22\n\t" /* z000z200z400z600 */
"or %%r22,%%r20,%%r24\n\t" /* z0z1z2z3z4z5z6z7 */
"std %%r24,0(%0)\n\t" /* write back */
:
: "r" (x), "r" (y)
: "%r24", "%r23", "%r22", "%r21", "%r20", "%r1", "%r28", "memory");
#endif
}
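/* Scalar sketch of the workaround above, one byte lane at a time: each x byte is kept
   in the high byte of a halfword, and since hadd,us takes a signed second operand
   (see the comment above), y<<8 can look negative; so y is halved first (losing its
   LSB), added twice, and the saved LSB is added last, with every add saturating at
   0xFFFF. Reference only, kept under #if 0; the lane function name is ours. */
#if 0
static uint8_t add8x8us_lane(uint8_t x, uint8_t y)
{
    unsigned hx  = (unsigned)x << 8;        /* x in the high byte of a halfword */
    unsigned hy2 = (unsigned)(y >> 1) << 8; /* (y/2)<<8 always fits a positive int16 */
    unsigned lsb = (unsigned)(y & 1) << 8;  /* the bit lost by halving, shifted back */
    unsigned s   = hx;
    s = (s + hy2 > 0xFFFF) ? 0xFFFF : s + hy2; /* hadd,us: x + y/2 */
    s = (s + hy2 > 0xFFFF) ? 0xFFFF : s + hy2; /* hadd,us: x + 2*(y/2) */
    s = (s + lsb > 0xFFFF) ? 0xFFFF : s + lsb; /* hadd,us: + the lost LSB */
    return (uint8_t)(s >> 8);               /* back down to the byte position */
}
#endif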
/* adding together each (x,y) byte couple - modular arithmetic */
/* TESTED OK */
void add8x8(uint8_t *x, const uint8_t *y)
{
#ifdef USE_C /* C equiv */
int i;
for (i=0; i<8; i++)
x[i] += y[i];
#else
/* assert x[8] and y[8] */
__asm__ __volatile__( /* r26, r25 */
"ldd 0(%0),%%r1\n\t" /* x0x1x2x3x4x5x6x7 */
"hshl %%r1,8,%%r21\n\t" /* x100x300x500x700 */
"hshr,u %%r21,8,%%r23\n\t" /* 00x100x300x500x7 */
"hshr,u %%r1,8,%%r22\n\t" /* 00x000x200x400x6 */
"ldd 0(%1),%%r1\n\t" /* y0y1y2y3y4y5y6y7 */
"hshl %%r1,8,%%r21\n\t" /* y100y300y500y700 */
"hshr,u %%r21,8,%%r20\n\t" /* 00y100y300y500y7 */
"hshr,u %%r1,8,%%r21\n\t" /* 00y000y200y400y6 */
"hadd %%r23,%%r20,%%r24\n\t" /* ZZz1ZZz3ZZz5ZZz7 - z=(x+y) modular */
"hadd %%r22,%%r21,%%r23\n\t" /* ZZz0ZZz2ZZz4ZZz6 - z=(x+y) modular */
"hshl %%r23,8,%%r1\n\t" /* z000z200z400z600 */
"hshl %%r24,8,%%r22\n\t" /* z100z300z500z700 */
"hshr,u %%r22,8,%%r24\n\t" /* 00z100z300z500z7 */
"or %%r24,%%r1,%%r20\n\t" /* z0z1z2z3z4z5z6z7 */
"std %%r20,0(%0)\n\t" /* write back */
:
: "r" (x), "r" (y)
: "%r24", "%r23", "%r22", "%r21", "%r20", "%r1", "memory");
#endif
}
/* dst and src are 16-byte aligned */
void add_bytes_max(uint8_t *dst, const uint8_t *src, int w)
{
int i;
for (i=0; i+7<w; i+=8)
add8x8(dst+i, src+i);
for (; i<w; i++)
dst[i+0] += src[i+0];
}
/* subbing together each (x,y) byte couple - unsigned saturation */
/* TESTED OK */
void sub8x8us(uint8_t *dst, const uint8_t *x, const uint8_t *y)
{
#ifdef USE_C /* C equiv */
int i, s;
for (i=0; i<8; i++) {
s = x[i] - y[i];
dst[i] = (s<0) ? 0 : (uint8_t)s;
}
#else
/* assert x[8] and y[8] - args r26, r25, r24 */
__asm__ __volatile__(
"ldd 0(%1),%%r1\n\t" /* x0x1x2x3x4x5x6x7 */
"hshl %%r1,8,%%r21\n\t" /* x100x300x500x700 */
"hshr,u %%r21,8,%%r23\n\t" /* 00x100x300x500x7 */
"hshr,u %%r1,8,%%r22\n\t" /* 00x000x200x400x6 */
"ldd 0(%2),%%r1\n\t" /* y0y1y2y3y4y5y6y7 */
"hshl %%r1,8,%%r21\n\t" /* y100y300y500y700 */
"hshr,u %%r21,8,%%r20\n\t" /* 00y100y300y500y7 */
"hshr,u %%r1,8,%%r21\n\t" /* 00y000y200y400y6 */
"hsub,us %%r23,%%r20,%%r29\n\t" /* 00z100z300z500z7 - z=(x-y) unsigned sat */
"hsub,us %%r22,%%r21,%%r28\n\t" /* 00z000z200z400z6 - z=(x-y) unsigned sat */
"hshl %%r28,8,%%r1\n\t" /* z000z200z400z600 */
"or %%r29,%%r1,%%r20\n\t" /* z0z1z2z3z4z5z6z7 */
"std %%r20,0(%0)\n\t" /* write back */
:
: "r" (dst), "r" (x), "r" (y)
: "%r1", "%r23", "%r22", "%r21", "%r20", "%r28", "%r29", "memory");
#endif
}
/* subbing together each (x,y) byte couple - modular arithmetic */
/* TESTED OK */
void sub8x8(uint8_t *dst, const uint8_t *x, const uint8_t *y)
{
#ifdef USE_C /* C equiv */
int i;
for (i=0; i<8; i++)
dst[i] = x[i] - y[i];
#else
/* assert x[8] and y[8] - args r26, r25, r24 */
__asm__ __volatile__(
"ldd 0(%1),%%r1\n\t" /* x0x1x2x3x4x5x6x7 */
"hshl %%r1,8,%%r21\n\t" /* x100x300x500x700 */
"hshr,u %%r21,8,%%r23\n\t" /* 00x100x300x500x7 */
"hshr,u %%r1,8,%%r22\n\t" /* 00x000x200x400x6 */
"ldd 0(%2),%%r1\n\t" /* y0y1y2y3y4y5y6y7 */
"hshl %%r1,8,%%r21\n\t" /* y100y300y500y700 */
"hshr,u %%r21,8,%%r20\n\t" /* 00y100y300y500y7 */
"hshr,u %%r1,8,%%r21\n\t" /* 00y000y200y400y6 */
"hsub %%r23,%%r20,%%r1\n\t" /* ZZz1ZZz3ZZz5ZZz7 - z=(x-y) modular */
"hsub %%r22,%%r21,%%r23\n\t" /* ZZz0ZZz2ZZz4ZZz6 - z=(x-y) modular */
"hshl %%r23,8,%%r20\n\t" /* z000z200z400z600 */
"hshl %%r1,8,%%r22\n\t" /* z100z300z500z700 */
"hshr,u %%r22,8,%%r1\n\t" /* 00z100z300z500z7 */
"or %%r1,%%r20,%%r21\n\t" /* z0z1z2z3z4z5z6z7 */
"std %%r21,0(%0)\n\t" /* write back */
:
: "r" (dst), "r" (x), "r" (y)
: "%r23", "%r22", "%r21", "%r20", "%r1", "memory");
#endif
}
void diff_bytes_max(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w)
{
int i;
for (i=0; i+7<w; i+=8)
sub8x8(dst+i, src1+i, src2+i);
for (; i<w; i++)
dst[i+0] = src1[i+0]-src2[i+0];
}
/* TESTED OK */
int pix_sum_max(uint8_t *pix, int line_size)
{
#ifdef USE_C /* C equiv */
int s, i, j;
s = 0;
for (i = 0; i < 16; i++) {
for (j = 0; j < 16; j += 8) {
s += pix[0];
s += pix[1];
s += pix[2];
s += pix[3];
s += pix[4];
s += pix[5];
s += pix[6];
s += pix[7];
pix += 8;
}
pix += line_size - 16;
}
return s;
#else
int i;
int s __attribute__((aligned(8)));
s = 0; /* XXX align */
for (i = 0; i < 16; i++) {
/* r26, r25 args */
__asm__ __volatile__(
"ldd 0(%1),%%r24\n\t" /* x0x1x2x3x4x5x6x7 */
"hshl %%r24,8,%%r21\n\t" /* x100x300x500x700 */
"hshr,u %%r24,8,%%r22\n\t" /* 00x000x200x400x6 */
"hshr,u %%r21,8,%%r28\n\t" /* 00x100x300x500x7 */
"ldd 8(%1),%%r23\n\t" /* x8x9xaxbxcxdxexf */
"hshl %%r23,8,%%r1\n\t" /* x900xb00xd00xf00 */
"hshr,u %%r23,8,%%r20\n\t" /* 00x800xa00xc00xe */
"hshr,u %%r1,8,%%r29\n\t" /* 00x900xb00xd00xf */
"hadd %%r22,%%r28,%%r21\n\t" /* aaaabbbbccccdddd - a=x0+x1, b=x2+x3...*/
"hadd %%r20,%%r29,%%r1\n\t" /* eeeeffffgggghhhh - e=x8+x9, f=xa+xb...*/
"hadd %%r21,%%r1,%%r24\n\t" /* iiiijjjjkkkkllll - i=a+e, j=b+f... */
"permh,1133 %%r24,%%r23\n\t" /* jjjjjjjjllllllll */
"hadd,us %%r24,%%r23,%%r1\n\t" /* AAAABBBBCCCCDDDD - A=i+j, C=k+l, B,D unused */
SHLD(%%r1,32,%%r20) /* CCCCDDDD00000000 */
"add,l %%r1,%%r20,%%r24\n\t" /* EEEEFFFFCCCCDDDD - E=A+C, F unused */
SHRDU(%%r24,48,%%r20) /* 000000000000EEEE - E=sum(x0:xf) */
"add,l %%r20,%0,%0\n\t" /* accumulate this row's sum into s */
: "+r" (s)
: "r" (pix)
: "%r24", "%r23", "%r22", "%r21", "%r20", "%r1", "%r29", "%r28" /*, "memory"*/);
pix += line_size; /* the asm does not advance pix, so step a full line here */
}
return s;
#endif
}
/* TESTED OK */
void get_pixels_max(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
#ifdef USE_C /* C equiv */
int i;
/* read the pixels */
for(i=0;i<8;i++) {
block[0] = pixels[0];
block[1] = pixels[1];
block[2] = pixels[2];
block[3] = pixels[3];
block[4] = pixels[4];
block[5] = pixels[5];
block[6] = pixels[6];
block[7] = pixels[7];
pixels += line_size;
block += 8;
}
#else
int i;
/* read the pixels */
for(i=0;i<8;i++) {
/* r26, r25, r24 args */
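/* mixh,l / mixh,r interleave the left / right halfword of each 32-bit word of the
   two sources, and mixw,l / mixw,r do the same at word granularity (see the traces
   below); the net effect is that the 8 bytes come out as 8 consecutive 16-bit
   DCTELEMs in source order. */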
__asm__ __volatile__(
"ldd,m %2(%1),%%r23\n\t" /* x0x1x2x3x4x5x6x7; pixels+=ls */
"hshl %%r23,8,%%r21\n\t" /* x100x300x500x700 */
"hshr,u %%r23,8,%%r22\n\t" /* 00x000x200x400x6 unsigned */
"hshr,u %%r21,8,%%r20\n\t" /* 00x100x300x500x7 unsigned */
"mixh,l %%r22,%%r20,%%r21\n\t" /* 00x000x100x400x5 */
"mixh,r %%r22,%%r20,%%r23\n\t" /* 00x200x300x600x7 */
"mixw,l %%r21,%%r23,%%r20\n\t" /* 00x000x100x200x3 */
"std,ma %%r20,8(%0)\n\t" /* write back; block+=4 */
"mixw,r %%r21,%%r23,%%r22\n\t" /* 00x400x500x600x7 */
"std,ma %%r22,8(%0)\n\t" /* write back; block+=4 */
: "+r" (block), "+r" (pixels)
: "r" (line_size)
: "%r23", "%r22", "%r21", "%r20", "memory");
}
#endif
}
/* TESTED OK */
void diff_pixels_max(DCTELEM *restrict block, const uint8_t *s1,
const uint8_t *s2, int stride)
{
int i;
#ifdef USE_C /* C equiv */
/* read the pixels */
for(i=0;i<8;i++) {
block[0] = s1[0] - s2[0];
block[1] = s1[1] - s2[1];
block[2] = s1[2] - s2[2];
block[3] = s1[3] - s2[3];
block[4] = s1[4] - s2[4];
block[5] = s1[5] - s2[5];
block[6] = s1[6] - s2[6];
block[7] = s1[7] - s2[7];
s1 += stride;
s2 += stride;
block += 8;
}
#else
/* read the pixels r26, r25, r24, r23 args */
for(i=0;i<8;i++) {
__asm__ __volatile__(
"ldd,m %3(%1),%%r22\n\t" /* x0x1x2x3x4x5x6x7; s1+=stride */
"hshl %%r22,8,%%r21\n\t" /* x100x300x500x700 */
"hshr,u %%r22,8,%%r20\n\t" /* 00x000x200x400x6 */
"hshr,u %%r21,8,%%r1\n\t" /* 00x100x300x500x7 */
"ldd,m %3(%2),%%r22\n\t" /* y0y1y2y3y4y5y6y7; s2+=stride */
"hshl %%r22,8,%%r21\n\t" /* y100y300y500y700 */
"hshr,u %%r22,8,%%r28\n\t" /* 00y000y200y400y6 */
"hshr,u %%r21,8,%%r29\n\t" /* 00y100y300y500y7 */
"hsub %%r20,%%r28,%%r21\n\t" /* ZZz0ZZz2ZZz4ZZz6 - z=(x-y) modular */
"hsub %%r1,%%r29,%%r22\n\t" /* ZZz1ZZz3ZZz5ZZz7 - z=(x-y) modular */
"mixh,l %%r21,%%r22,%%r20\n\t" /* 00z000z100z400z5 */
"mixh,r %%r21,%%r22,%%r1\n\t" /* 00z200z300z600z7 */
"mixw,l %%r20,%%r1,%%r28\n\t" /* 00z000z100z200z3 */
"std,ma %%r28,8(%0)\n\t" /* write back; block+=4 */
"mixw,r %%r20,%%r1,%%r29\n\t" /* 00z400z500z600z7 */
"std,ma %%r29,8(%0)\n\t" /* write back; block+=4 */
: "+r" (block), "+r" (s1), "+r" (s2)
: "r" (stride)
: "%r22", "%r21", "%r20", "%r1", "%r28", "%r29", "memory");
}
#endif
}
/* will block+=line_size; pixels+=line_size */
#define PUT8_MAX(block, pixels, line_size) \
__asm__ __volatile__( \
"ldd,m %2(%1),%%r20\n\t" \
"std %%r20,0(%0)\n\t" \
"add %2,%0,%0\n\t" \
: "+r" (block), "+r" (pixels)\
: "r" (line_size) \
: "%r20", "memory")
/* will block+=line_size; pixels+=line_size */
#define PUT16_MAX(block, pixels, line_size) \
__asm__ __volatile__( \
"ldd 8(%1),%%r20\n\t" \
"std %%r20,8(%0)\n\t" \
"ldd,m %2(%1),%%r21\n\t" \
"std %%r21,0(%0)\n\t" \
"add %2,%0,%0\n\t" \
: "+r" (block), "+r" (pixels)\
: "r" (line_size) \
: "%r20", "%r21", "memory")
#define AVG8_MAX(block, pixels, line_size) \
__asm__ __volatile__( \
"ldd 0(%0),%%r1\n\t" /* x0x1x2x3x4x5x6x7 */ \
"hshl %%r1,8,%%r22\n\t" /* x100x300x500x700 */ \
"hshr,u %%r22,8,%%r21\n\t" /* 00x100x300x500x7 */ \
"hshr,u %%r1,8,%%r22\n\t" /* 00x000x200x400x6 */ \
"ldd,m %2(%1),%%r28\n\t" /* y0y1y2y3y4y5y6y7; pixels+=ls */ \
"hshl %%r28,8,%%r20\n\t" /* y100y300y500y700 */ \
"hshr,u %%r20,8,%%r1\n\t" /* 00y100y300y500y7 */ \
"hshr,u %%r28,8,%%r20\n\t" /* 00y000y200y400y6 */ \
"havg %%r21,%%r1,%%r28\n\t" /* 00z100z300z500z7 - z=avg(x,y) unbiased rounding */ \
"havg %%r22,%%r20,%%r1\n\t" /* 00z000z200z400z6 - z=avg(x,y) unbiased rounding */ \
"hshl %%r1,8,%%r21\n\t" /* z000z200z400z600 */ \
"or %%r28,%%r21,%%r20\n\t" /* z0z1z2z3z4z5z6z7 */ \
"std %%r20,0(%0)\n\t" /* write back */ \
"add %2,%0,%0\n\t" /* block+=line_size */ \
: "+r" (block), "+r" (pixels) \
: "r" (line_size) \
: "%r1", "%r28", "%r22", "%r21", "%r20", "memory")
#define AVG16_MAX(block, pixels, line_size) \
__asm__ __volatile__( \
/* deal with the far 64 bits first: pixels+8 and block+8 */ \
"ldd 8(%0),%%r1\n\t" /* x0x1x2x3x4x5x6x7 */ \
"hshl %%r1,8,%%r22\n\t" /* x100x300x500x700 */ \
"hshr,u %%r22,8,%%r21\n\t" /* 00x100x300x500x7 */ \
"hshr,u %%r1,8,%%r22\n\t" /* 00x000x200x400x6 */ \
"ldd 8(%1),%%r28\n\t" /* y0y1y2y3y4y5y6y7 */ \
"hshl %%r28,8,%%r20\n\t" /* y100y300y500y700 */ \
"hshr,u %%r20,8,%%r1\n\t" /* 00y100y300y500y7 */ \
"hshr,u %%r28,8,%%r20\n\t" /* 00y000y200y400y6 */ \
"havg %%r21,%%r1,%%r28\n\t" /* 00z100z300z500z7 - z=avg(x,y) unbiased rounding */ \
"havg %%r22,%%r20,%%r1\n\t" /* 00z000z200z400z6 - z=avg(x,y) unbiased rounding */ \
"hshl %%r1,8,%%r21\n\t" /* z000z200z400z600 */ \
"or %%r28,%%r21,%%r20\n\t" /* z0z1z2z3z4z5z6z7 */ \
"std %%r20,8(%0)\n\t" /* write back */ \
/* now the first 64 bits */ \
"ldd 0(%0),%%r1\n\t" /* x0x1x2x3x4x5x6x7 */ \
"hshl %%r1,8,%%r22\n\t" /* x100x300x500x700 */ \
"hshr,u %%r22,8,%%r21\n\t" /* 00x100x300x500x7 */ \
"hshr,u %%r1,8,%%r22\n\t" /* 00x000x200x400x6 */ \
"ldd,m %2(%1),%%r28\n\t" /* y0y1y2y3y4y5y6y7; pixels+=ls */ \
"hshl %%r28,8,%%r20\n\t" /* y100y300y500y700 */ \
"hshr,u %%r20,8,%%r1\n\t" /* 00y100y300y500y7 */ \
"hshr,u %%r28,8,%%r20\n\t" /* 00y000y200y400y6 */ \
"havg %%r21,%%r1,%%r28\n\t" /* 00z100z300z500z7 - z=avg(x,y) unbiased rounding */ \
"havg %%r22,%%r20,%%r1\n\t" /* 00z000z200z400z6 - z=avg(x,y) unbiased rounding */ \
"hshl %%r1,8,%%r21\n\t" /* z000z200z400z600 */ \
"or %%r28,%%r21,%%r20\n\t" /* z0z1z2z3z4z5z6z7 */ \
"std %%r20,0(%0)\n\t" /* write back */ \
"add %2,%0,%0\n\t" /* block+=line_size */ \
: "+r" (block), "+r" (pixels) \
: "r" (line_size) \
: "%r1", "%r28", "%r22", "%r21", "%r20", "memory")
void avg_pixels16_max(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
int i;
DASSERT((line_size % 16) == 0);
for (i=0; i<h; i++)
AVG16_MAX(block, pixels, line_size);
}
void avg_pixels8_max(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
#if 0
#define OP(a,b) a = avg(a,b)
int i;
for(i=0; i<h; i++){
OP(*((uint64_t*)block), LD64(pixels));
pixels+=line_size;
block +=line_size;
}
#endif
int i;
DASSERT((line_size % 8) == 0);
for (i=0; i<h; i++)
AVG8_MAX(block, pixels, line_size);
}
void avg_pixels8_x2_max(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
#if 0
#define OP(a,b) a = avg(a,b)
int i;
for(i=0; i<h; i++){
const uint64_t a= LD64(pixels );
const uint64_t b= LD64(pixels+1);
OP(*((uint64_t*)block), avg(a,b));
pixels+=line_size;
block +=line_size;
}
#endif
int i;
DASSERT((line_size % 8) == 0);
/* r26, r25, r24, r23 are params */
for(i=0; i<h; i++) {
__asm__ __volatile__(
"ldd 0(%1),%%r1\n\t" /* x0x1x2x3x4x5x6x7 */
SHLD(%%r1,8,%%r20) /* x1x2x3x4x5x6x700 */
"hshl %%r1,8,%%r22\n\t" /* x100x300x500x700 */
"hshr,u %%r22,8,%%r21\n\t" /* 00x100x300x500x7 */
"hshr,u %%r1,8,%%r22\n\t" /* 00x000x200x400x6 */
"ldd 8(%1),%%r28\n\t" /* x8x9xaxbxcxdxexf - ldd will cache for the next iteration */
SHRDU(%%r28,56,%%r1) /* 00000000000000x8 */
"or %%r20,%%r1,%%r28\n\t" /* x1x2x3x4x5x6x7x8 aka pixels+1; aka Y */
"hshl %%r28,8,%%r20\n\t" /* y100y300y500y700 */
"hshr,u %%r20,8,%%r1\n\t" /* 00y100y300y500y7 */
"hshr,u %%r28,8,%%r20\n\t" /* 00y000y200y400y6 */
"havg %%r21,%%r1,%%r28\n\t" /* 00z100z300z500z7 - z=avg(x,y) unbiased rounding */
"havg %%r22,%%r20,%%r1\n\t" /* 00z000z200z400z6 - z=avg(x,y) unbiased rounding */
/* get block ready */
"ldd 0(%0),%%r20\n\t" /* w0w1w2w3w4w5w6w7 */
"hshl %%r20,8,%%r22\n\t" /* w100w300w500w700 */
"hshr,u %%r22,8,%%r21\n\t" /* 00w100w300w500w7 */
"hshr,u %%r20,8,%%r22\n\t" /* 00w000w200w400w6 */
/* average the whole family */
"havg %%r21,%%r28,%%r20\n\t" /* 00v100v300v500v7 - v=avg(z,w) unbiased rounding */
"havg %%r22,%%r1,%%r21\n\t" /* 00v000v200v400v6 - v=avg(z,w) unbiased rounding */
"hshl %%r21,8,%%r28\n\t" /* v000v200v400v600 */
"or %%r20,%%r28,%%r1\n\t" /* v0v1v2v3v4v5v6v7 */
"std %%r1,0(%0)\n\t" /* write back */
"add %2,%0,%0\n\t" /* block+=ls */
: "+r" (block), "+r" (pixels)
: "r" (line_size)
: "%r1", "%r28", "%r22", "%r21", "%r20", "memory");
pixels += line_size; /* TODO: move the second ldd to the top and use ldd,ma %2(%1) for the next iteration's load */
}
}
void avg_pixels8_y2_max(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
#if 0
#define OP(a,b) a = avg(a,b)
int i;
for(i=0; i<h; i++){
const uint64_t a= LD64(pixels );
const uint64_t b= LD64(pixels+line_size);
OP(*((uint64_t*)block), avg(a,b));
pixels+=line_size;
block +=line_size;
}
#endif
int i;
DASSERT((line_size % 8) == 0);
/* r26, r25, r24, r23 are params */
for(i=0; i<h; i++) {
__asm__ __volatile__(
"ldd,m %2(%1),%%r1\n\t" /* x0x1x2x3x4x5x6x7; pixels+=ls */
"hshl %%r1,8,%%r22\n\t" /* x100x300x500x700 */
"hshr,u %%r22,8,%%r21\n\t" /* 00x100x300x500x7 */
"hshr,u %%r1,8,%%r22\n\t" /* 00x000x200x400x6 */
"ldd 0(%1),%%r28\n\t" /* X+line_size, aka Y */
"hshl %%r28,8,%%r20\n\t" /* y100y300y500y700 */
"hshr,u %%r20,8,%%r1\n\t" /* 00y100y300y500y7 */
"hshr,u %%r28,8,%%r20\n\t" /* 00y000y200y400y6 */
"havg %%r21,%%r1,%%r28\n\t" /* 00z100z300z500z7 - z=avg(x,y) unbiased rounding */
"havg %%r22,%%r20,%%r1\n\t" /* 00z000z200z400z6 - z=avg(x,y) unbiased rounding */
/* get block ready */
"ldd 0(%0),%%r20\n\t" /* w0w1w2w3w4w5w6w7 */
"hshl %%r20,8,%%r22\n\t" /* w100w300w500w700 */
"hshr,u %%r22,8,%%r21\n\t" /* 00w100w300w500w7 */
"hshr,u %%r20,8,%%r22\n\t" /* 00w000w200w400w6 */
/* average the whole family */
"havg %%r21,%%r28,%%r20\n\t" /* 00v100v300v500v7 - v=avg(z,w) unbiased rounding */
"havg %%r22,%%r1,%%r21\n\t" /* 00v000v200v400v6 - v=avg(z,w) unbiased rounding */
"hshl %%r21,8,%%r28\n\t" /* v000v200v400v600 */
"or %%r20,%%r28,%%r1\n\t" /* v0v1v2v3v4v5v6v7 */
"std %%r1,0(%0)\n\t" /* write back */
"add %2,%0,%0\n\t" /* block+=ls */
: "+r" (block), "+r" (pixels)
: "r" (line_size)
: "%r1", "%r28", "%r22", "%r21", "%r20", "memory");
}
}
void put_pixels16_max(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
int i;
DASSERT((line_size % 16) == 0);
for(i=0; i<h; i++)
PUT16_MAX(block, pixels, line_size);
}
void put_pixels8_max(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
#if 0
#define OP(a,b) a = b
int i;
for(i=0; i<h; i++){
OP(*((uint64_t*)block), LD64(pixels));
pixels+=line_size;
block +=line_size;
}
#endif
int i;
DASSERT((line_size % 8) == 0);
for(i=0; i<h; i++)
PUT8_MAX(block, pixels, line_size);
}
void put_pixels8_x2_max(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
#if 0
#define OP(a,b) a = b
int i;
for(i=0; i<h; i++){
const uint64_t a= LD64(pixels );
const uint64_t b= LD64(pixels+1);
OP(*((uint64_t*)block), avg(a,b));
pixels+=line_size;
block +=line_size;
}
#endif
int i;
DASSERT((line_size % 8) == 0);
/* r26, r25, r24, r23 are params */
for(i=0; i<h; i++) {
__asm__ __volatile__(
"ldd 0(%1),%%r1\n\t" /* x0x1x2x3x4x5x6x7 */
SHLD(%%r1,8,%%r20) /* x1x2x3x4x5x6x700 */
"hshl %%r1,8,%%r22\n\t" /* x100x300x500x700 */
"hshr,u %%r22,8,%%r21\n\t" /* 00x100x300x500x7 */
"hshr,u %%r1,8,%%r22\n\t" /* 00x000x200x400x6 */
"ldd 8(%1),%%r28\n\t" /* x8x9xaxbxcxdxexf - ldd will cache for the next iteration */
SHRDU(%%r28,56,%%r1) /* 00000000000000x8 */
"or %%r20,%%r1,%%r28\n\t" /* x1x2x3x4x5x6x7x8 aka pixels+1; aka Y */
"hshl %%r28,8,%%r20\n\t" /* y100y300y500y700 */
"hshr,u %%r20,8,%%r1\n\t" /* 00y100y300y500y7 */
"hshr,u %%r28,8,%%r20\n\t" /* 00y000y200y400y6 */
"havg %%r21,%%r1,%%r28\n\t" /* 00z100z300z500z7 - z=avg(x,y) unbiased rounding */
"havg %%r22,%%r20,%%r1\n\t" /* 00z000z200z400z6 - z=avg(x,y) unbiased rounding */
"hshl %%r1,8,%%r21\n\t" /* z000z200z400z600 */
"or %%r28,%%r21,%%r20\n\t" /* z0z1z2z3z4z5z6z7 */
"std %%r20,0(%0)\n\t" /* write back */
"add %2,%0,%0\n\t" /* block+=ls */
: "+r" (block), "+r" (pixels)
: "r" (line_size)
: "%r1", "%r28", "%r22", "%r21", "%r20", "memory");
pixels += line_size; /* TODO: move the second ldd to the top and use ldd,ma %2(%1) for the next iteration's load */
}
}
void put_pixels8_y2_max(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
#if 0
#define OP(a,b) a = b
int i;
for(i=0; i<h; i++){
const uint64_t a= LD64(pixels );
const uint64_t b= LD64(pixels+line_size);
OP(*((uint64_t*)block), avg(a,b));
pixels+=line_size;
block +=line_size;
}
#endif
int i;
DASSERT((line_size % 8) == 0);
/* r26, r25, r24, r23 are params */
for(i=0; i<h; i++) {
__asm__ __volatile__(
"ldd,m %2(%1),%%r1\n\t" /* x0x1x2x3x4x5x6x7; pixels+=ls */
"hshl %%r1,8,%%r22\n\t" /* x100x300x500x700 */
"hshr,u %%r22,8,%%r21\n\t" /* 00x100x300x500x7 */
"hshr,u %%r1,8,%%r22\n\t" /* 00x000x200x400x6 */
"ldd 0(%1),%%r28\n\t" /* X+line_size, aka Y */
"hshl %%r28,8,%%r20\n\t" /* y100y300y500y700 */
"hshr,u %%r20,8,%%r1\n\t" /* 00y100y300y500y7 */
"hshr,u %%r28,8,%%r20\n\t" /* 00y000y200y400y6 */
"havg %%r21,%%r1,%%r28\n\t" /* 00z100z300z500z7 - z=avg(x,y) unbiased rounding */
"havg %%r22,%%r20,%%r1\n\t" /* 00z000z200z400z6 - z=avg(x,y) unbiased rounding */
"hshl %%r1,8,%%r21\n\t" /* z000z200z400z600 */
"or %%r28,%%r21,%%r20\n\t" /* z0z1z2z3z4z5z6z7 */
"std %%r20,0(%0)\n\t" /* write back */
"add %2,%0,%0\n\t" /* block+=ls */
: "+r" (block), "+r" (pixels)
: "r" (line_size)
: "%r1", "%r28", "%r22", "%r21", "%r20", "memory");
}
}
#if 1
void dsputil_init_parisc(DSPContext* c, AVCodecContext *avctx)
{
c->prefetch = prefetch_pa;
c->clear_blocks = clear_blocks_pa;
c->add_bytes = add_bytes_max;
c->diff_bytes = diff_bytes_max;
c->pix_sum = pix_sum_max;
c->get_pixels = get_pixels_max;
c->diff_pixels = diff_pixels_max;
/* thanks to unbiased rounding we don't have to differentiate no_rnd funcs */
/* in these funcs, block is 8- or 16-byte aligned, but pixels is only byte-aligned.
XXX to be dealt with. Also note that h for op_pixels_func is limited to {width/2, width},
never larger than 16 and never smaller than 4 */
c->put_pixels_tab[0][0] = put_pixels16_max;
c->put_pixels_tab[1][0] = put_pixels8_max;
c->put_pixels_tab[1][1] = put_pixels8_x2_max;
c->put_pixels_tab[1][2] = put_pixels8_y2_max;
c->put_no_rnd_pixels_tab[0][0] = put_pixels16_max;
c->put_no_rnd_pixels_tab[1][0] = put_pixels8_max;
c->put_no_rnd_pixels_tab[1][1] = put_pixels8_x2_max;
c->put_no_rnd_pixels_tab[1][2] = put_pixels8_y2_max;
c->avg_pixels_tab[0][0] = avg_pixels16_max;
c->avg_pixels_tab[1][0] = avg_pixels8_max;
c->avg_pixels_tab[1][1] = avg_pixels8_x2_max;
c->avg_pixels_tab[1][2] = avg_pixels8_y2_max;
c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_max;
c->avg_no_rnd_pixels_tab[1][0] = avg_pixels8_max;
c->avg_no_rnd_pixels_tab[1][1] = avg_pixels8_x2_max;
c->avg_no_rnd_pixels_tab[1][2] = avg_pixels8_y2_max;
#if 0 /* TODO: port the xy2 variants (the names below are from the AltiVec init) */
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
#endif
}
#endif