1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
|
// Uploads one video frame into level 0 of HWD3D_texture.
//
// FCC_YUY2 frames are converted to 4-byte pixels (byte 0 = B, byte 1 = G,
// byte 2 = R; the C path never writes byte 3). Any other FourCC is copied
// line by line as raw data, 4 bytes per pixel of the source rectangle width.
//
// pData : frame to upload; its source rectangle selects the region read.
// Returns false when there is no texture, pData is null, or the texture
// cannot be locked; true once the texture has been filled and unlocked.
//
// NOTE(review): nothing here checks the source rectangle against the texture
// dimensions; presumably the caller guarantees the texture is large enough.
bool CLXD3DEngine::ProcessDatas(XCMedia::IVideoData* pData)
{
    if (!HWD3D_texture || !pData)
    {
        return false;
    }

    D3DLOCKED_RECT lock;
    long lSourcePitch = pData->GetPitchInByte();
    // NOTE(review): for interleaved data only half the lines are processed,
    // yet both pitches below advance one line at a time - confirm intent.
    long lNBLines = pData->IsInterleaved() ? (long)(pData->GetSize().GetHeight()) >> 1 : (long)pData->GetSize().GetHeight();
    // NOTE(review): this is used as a byte offset but is scaled by
    // GetBitCount(); if that returns bits per pixel the offset is 8x too
    // large for a non-zero left edge - verify against IVideoData.
    long lXOffset = (long)(pData->GetSourceRect().GetLeft()) * pData->GetBitCount();
    long lYOffset = (long)(pData->GetSourceRect().GetTop()) * lSourcePitch;

    if (FAILED(HWD3D_texture->LockRect (0, &lock, NULL, D3DLOCK_DISCARD)))
    {
        return false;
    }

    BYTE* pSourceByte = pData->GetData();
    BYTE* pTargetByte = (BYTE*)lock.pBits;
    long lTargetPitch = lock.Pitch;
    pSourceByte += lYOffset;

    // Upload with conversion.
    switch (pData->GetFOURCC())
    {
    case FCC_YUY2:
        {
            // Width of the source rectangle in pixels; constant for the frame.
            int scanwidth = (long)pData->GetSourceRect().GetWidth();
            // The MMX loop consumes 4 pixels per iteration. The leftover
            // (scanwidth & 3) pixels are converted in C first, two pixels
            // (one 32-bit YUY2 word) per iteration. An odd trailing pixel
            // has no complete YUY2 word and is left unconverted.
            const int padPairs = (scanwidth & 3) >> 1;

            for (int y = 0; y < lNBLines; y++)
            {
                BYTE *_in = pSourceByte + lXOffset;
                BYTE *_out = pTargetByte;
                int *yuv = (int *)_in;

                // C Padding.
                // NOTE(review): this path takes U from byte 1 and V from
                // byte 3 with the 409/516 factors on V/U, while the active
                // MMX tables below apply them the other way round (see the
                // commented-out tables) - the two paths may disagree on
                // colour; confirm which one is intended.
                for ( int x = 0; x < padPairs; x++ )
                {
                    int YUY2 = *yuv++;
                    int V = (YUY2 >> 24) & 0xff;
                    int Y2 = (YUY2 >> 16) & 0xff;
                    int U = (YUY2 >> 8) & 0xff;
                    int Y1 = (YUY2) & 0xff;
                    // Fixed components for 2 pixels.
                    int D = U - 128;
                    int E = V - 128;
                    int D100 = D * 100; // Ux100
                    int E409A = E * 409 + 128; // Vx409
                    int D516A = D * 516 + 128; // Ux516
                    int E208A = E * 208 + 128; // Vx208
                    // Convert pixel 1.
                    int C = (Y1 - (16)) * 298;
                    int R = (C + E409A) >> 8;
                    if (R < 0) _out[2] = 0; else if (R > 255) _out[2] = 255; else _out[2] = (BYTE)R;
                    int G = (C - D100 - E208A) >> 8;
                    if (G < 0) _out[1] = 0; else if (G > 255) _out[1] = 255; else _out[1] = (BYTE)G;
                    int B = (C + D516A ) >> 8;
                    if (B < 0) _out[0] = 0; else if (B > 255) _out[0] = 255; else _out[0] = (BYTE)B;
                    _out += 4;
                    // Convert pixel 2.
                    C = (Y2 - (16)) * 298;
                    R = (C + E409A) >> 8;
                    if (R < 0) _out[2] = 0; else if (R > 255) _out[2] = 255; else _out[2] = (BYTE)R;
                    G = (C - D100 - E208A) >> 8;
                    if (G < 0) _out[1] = 0; else if (G > 255) _out[1] = 255; else _out[1] = (BYTE)G;
                    B = (C + D516A ) >> 8;
                    if (B < 0) _out[0] = 0; else if (B > 255) _out[0] = 255; else _out[0] = (BYTE)B;
                    _out += 4;
                }

                // 586+ MMX assembly, barr./niji (6 bits).
                // The loop continues from wherever the C padding left
                // yuv / _out.
                static unsigned __int64 _mmx_null = 0x0000000000000000;
                static unsigned __int64 _mmx_csub = 0x0080001000800010;
                //static unsigned __int64 _mmx_cmulA = 0xffcc004affe7004a; // -208,298,-100,298 // 6 bits MMX table
                //static unsigned __int64 _mmx_cmulB = 0x0066000000810000; // 409,0,516,0
                static unsigned __int64 _mmx_cmulA = 0xffe7004affcc004a; // -100,298,-208,298 // 6 bits MMX table
                static unsigned __int64 _mmx_cmulB = 0x0081000000660000; // 516,0,409,0
                static unsigned __int64 _mmx_caddA = 0xff80000000000000; // -128,0,0,0
                static unsigned __int64 _mmx_caddB = 0x0080000000800000; // 128,0,128,0
                static unsigned __int64 _mmx_cand = 0x00000000ffff0000;
                static unsigned __int64 _mmx_candc1 = 0x000000000000ffff;
                static unsigned __int64 _mmx_candc2 = 0x0000ffff00000000;
                __asm
                {
                    ; preload constants
                    mov ecx,scanwidth
                    mov esi,yuv
                    ; loop count = scanwidth / 4. This must be a shift: a rotate
                    ; would move the low two bits into the top of ecx and turn
                    ; any width that is not a multiple of 4 into a huge count.
                    shr ecx,2
                    mov edi,_out
                    cmp ecx,0
                    jz __early_out
                __label_loop: ; process one scanline (4 pixels/loop (128 bits))
                    movd mm0,[esi]
                    movd mm4,[esi+4]
                    punpcklbw mm0,_mmx_null ; V_ Y2 U_ Y1
                    add esi,8
                    psubw mm0,_mmx_csub
                    punpcklbw mm4,_mmx_null
                    movq mm2,mm0
                    psubw mm4,_mmx_csub
                    pmullw mm0,_mmx_cmulA ; -E208 C2 -D100 C1
                    movq mm6,mm4
                    pmullw mm4,_mmx_cmulA
                    paddw mm0,_mmx_caddA ; -E208A C2 -D100 C1
                    pmullw mm2,_mmx_cmulB ; E409 x D516 x
                    paddw mm4,_mmx_caddA
                    pmullw mm6,_mmx_cmulB
                    paddw mm2,_mmx_caddB ; E409A x D516A x
                    paddw mm6,_mmx_caddB
                    movq mm1,mm0
                    psrlq mm2,16 ; x E409A x D516A
                    movq mm5,mm4
                    psrlq mm1,32 ; x x -E208A x
                    psrlq mm6,16 ; !
                    paddw mm1,mm0
                    psrlq mm5,32
                    paddw mm5,mm4
                    movq mm3,mm0
                    movq mm7,mm4
                    pand mm1,_mmx_cand ; x x (-E208A - D100) x
                    pand mm5,_mmx_cand
                    paddw mm2,mm1 ; x E409A (-E208A - D100) D516A
                    ; extract (x C1 C1 C1) & (x C2 C2 C2) ...bottleneck...
                    paddw mm6,mm5
                    pand mm0,_mmx_candc1 ; x x x C1
                    pand mm4,_mmx_candc1
                    movq mm1,mm0
                    psllq mm0,16 ; x x C1 x
                    movq mm5,mm4
                    por mm0,mm1 ; x x C1 C1
                    psllq mm4,16 ; !
                    psllq mm0,16 ; x C1 C1 x
                    por mm4,mm5
                    psllq mm4,16
                    por mm0,mm1 ; x C1 C1 C1
                    pand mm3,_mmx_candc2 ; x C2 x x
                    por mm4,mm5
                    pand mm7,_mmx_candc2
                    movq mm1,mm3
                    movq mm5,mm7
                    psrlq mm1,16 ; x x C2 x
                    psrlq mm5,16 ; !
                    por mm1,mm3 ; x C2 C2 x
                    psrlq mm1,16 ; x x C2 C2
                    por mm5,mm7
                    por mm1,mm3 ; x C2 C2 C2
                    psrlq mm5,16
                    por mm5,mm7
                    paddw mm0,mm2 ; x r1<<6 g1<<6 b1<<6
                    ;
                    paddw mm4,mm6
                    paddw mm1,mm2 ; x r2<<6 g2<<6 b2<<6
                    psraw mm0,6
                    paddw mm5,mm6
                    psraw mm4,6
                    add edi,16
                    psraw mm1,6 ; !
                    psraw mm5,6 ; !
                    packuswb mm0,mm1 ; x r2 g2 b2 x r1 g1 b1 (64 bits)
                    packuswb mm4,mm5
                    movq [edi-16],mm0
                    movq [edi-8],mm4
                    dec ecx
                    jnz __label_loop
                __early_out: emms
                }

                // Next scanline in both buffers.
                pTargetByte += lTargetPitch;
                pSourceByte += lSourcePitch;
            }
        }
        break;
    default:
        {
            // Unhandled FourCC code: copy each line unconverted, 4 bytes per
            // pixel of the source rectangle width.
            for (int y = 0; y < lNBLines; y++)
            {
                XC::MemASM::Copy(pTargetByte, pSourceByte + lXOffset, (size_t)pData->GetSourceRect().GetWidth() * 4);
                pSourceByte += lSourcePitch;
                pTargetByte += lTargetPitch;
            }
            break;
        }
    }

    HWD3D_texture->UnlockRect(0);
    return true;
}
Partager