-rw-r--r--  gst/deinterlace2/tvtime/greedyh.asm      | 475
-rw-r--r--  gst/deinterlace2/tvtime/greedyh.c        | 266
-rw-r--r--  gst/deinterlace2/tvtime/greedyhmacros.h  |  42
3 files changed, 442 insertions(+), 341 deletions(-)
diff --git a/gst/deinterlace2/tvtime/greedyh.asm b/gst/deinterlace2/tvtime/greedyh.asm
index 8fd0ab66..fcd3a647 100644
--- a/gst/deinterlace2/tvtime/greedyh.asm
+++ b/gst/deinterlace2/tvtime/greedyh.asm
@@ -28,281 +28,216 @@
 #include "x86-64_macros.inc"
-void FUNCT_NAME( GstDeinterlace2 *object)
+void
+FUNCT_NAME (uint8_t * L1, uint8_t * L2, uint8_t * L3, uint8_t * L2P,
+    uint8_t * Dest, int size)
 {
-    int64_t i;
-    int InfoIsOdd = 0;
-    // in tight loop some vars are accessed faster in local storage
-    int64_t YMask        = 0x00ff00ff00ff00ffull; // to keep only luma
-    int64_t UVMask       = 0xff00ff00ff00ff00ull; // to keep only chroma
-    int64_t ShiftMask    = 0xfefffefffefffeffull; // to avoid shifting chroma to luma
-    int64_t QW256        = 0x0100010001000100ull; // 4 256's
-
-    // Set up our two parms that are actually evaluated for each pixel
-    i=GreedyMaxComb;
-    int64_t MaxComb = i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i;
-
-    i = GreedyMotionThreshold;		// scale to range of 0-257
-    int64_t MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask;
-
-    i = GreedyMotionSense;		// scale to range of 0-257
-    int64_t MotionSense = i << 48 | i << 32 | i << 16 | i;
-
-    int Line;
-    long LoopCtr;
-    unsigned int Pitch = object->field_stride;
-
-    unsigned char* L1;					// ptr to Line1, of 3
-    unsigned char* L2;					// ptr to Line2, the weave line
-    unsigned char* L3;					// ptr to Line3
-
-    unsigned char* L2P;					// ptr to prev Line2
-    unsigned char* Dest = GST_BUFFER_DATA(object->out_buf);
-
-    int64_t QW256B;
-    int64_t LastAvg=0;			//interp value from left qword
-
-    i = 0xffffffff - 256;
-    QW256B =  i << 48 |  i << 32 | i << 16 | i;  // save a couple instr on PMINSW instruct.
-
-
-    // copy first even line no matter what, and the first odd line if we're
-    // processing an EVEN field. (note diff from other deint rtns.)
-
-    if (object->field_history[object->history_count-1].flags == PICTURE_INTERLACED_BOTTOM) {
-      InfoIsOdd = 1;
-
-      L1 = GST_BUFFER_DATA(object->field_history[object->history_count-2].buf);
-      L2 = GST_BUFFER_DATA(object->field_history[object->history_count-1].buf);
-      L3 = L1 + Pitch;
-      L2P = GST_BUFFER_DATA(object->field_history[object->history_count-3].buf);
-
-      // copy first even line
-      object->pMemcpy(Dest, L1, object->line_length);
-      Dest += object->output_stride;
-    }
-    else {
-      InfoIsOdd = 0;
-      L1 = GST_BUFFER_DATA(object->field_history[object->history_count-2].buf);
-      L2 = GST_BUFFER_DATA(object->field_history[object->history_count-1].buf) + Pitch;
-      L3 = L1 + Pitch;
-      L2P = GST_BUFFER_DATA(object->field_history[object->history_count-3].buf) + Pitch;
-
-      // copy first even line
-      object->pMemcpy(Dest, GST_BUFFER_DATA(object->field_history[0].buf), object->line_length);
-      Dest += object->output_stride;
-      // then first odd line
-      object->pMemcpy(Dest, L1, object->line_length);
-      Dest += object->output_stride;
-    }
-
-
-    long oldbx;
-
-    for (Line = 0; Line < (object->field_height - 1); ++Line) {
-        LoopCtr = object->line_length / 8 - 1; // there are LineLength / 8 qwords per line but do 1 less, adj at end of loop
-
-        // For ease of reading, the comments below assume that we're operating on an odd
-        // field (i.e., that InfoIsOdd is true).  Assume the obvious for even lines..
-
-        __asm__ __volatile__
-            (
-             // save ebx (-fPIC)
-             MOVX" %%"XBX", %[oldbx]\n\t"
-
-             MOVX"  %[L1],          %%"XAX"\n\t"
-             LEAX"  8(%%"XAX"),     %%"XBX"\n\t"    // next qword needed by DJR
-             MOVX"  %[L3],          %%"XCX"\n\t"
-             SUBX"  %%"XAX",        %%"XCX"\n\t"    // carry L3 addr as an offset
-             MOVX"  %[L2P],         %%"XDX"\n\t"
-             MOVX"  %[L2],          %%"XSI"\n\t"
-             MOVX"  %[Dest],        %%"XDI"\n\t"    // DL1 if Odd or DL2 if Even
-
-             ".align 8\n\t"
-             "1:\n\t"
-
-             "movq  (%%"XSI"),      %%mm0\n\t"      // L2 - the newest weave pixel value
-             "movq  (%%"XAX"),      %%mm1\n\t"      // L1 - the top pixel
-             "movq  (%%"XDX"),      %%mm2\n\t"      // L2P - the prev weave pixel
-             "movq  (%%"XAX", %%"XCX"), %%mm3\n\t"  // L3, next odd row
-             "movq  %%mm1,          %%mm6\n\t"      // L1 - get simple single pixel interp
-             //	pavgb   mm6, mm3                    // use macro below
-             V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]")
-
-             // DJR - Diagonal Jaggie Reduction
-             // In the event that we are going to use an average (Bob) pixel we do not want a jagged
-             // stair step effect.  To combat this we avg in the 2 horizontally adjacen pixels into the
-             // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels.
-
-             "movq  %[LastAvg],     %%mm4\n\t"      // the bob value from prev qword in row
-             "movq  %%mm6,          %[LastAvg]\n\t" // save for next pass
-             "psrlq $48,            %%mm4\n\t"      // right justify 1 pixel
-             "movq  %%mm6,          %%mm7\n\t"      // copy of simple bob pixel
-             "psllq $16,            %%mm7\n\t"      // left justify 3 pixels
-             "por   %%mm7,          %%mm4\n\t"      // and combine
-
-             "movq  (%%"XBX"),      %%mm5\n\t"      // next horiz qword from L1
-             //			pavgb   mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below
-             V_PAVGB ("%%mm5", "(%%"XBX",%%"XCX")", "%%mm7", "%[ShiftMask]")
-             "psllq $48,            %%mm5\n\t"      // left just 1 pixel
-             "movq  %%mm6,          %%mm7\n\t"      // another copy of simple bob pixel
-             "psrlq $16,            %%mm7\n\t"      // right just 3 pixels
-             "por   %%mm7,          %%mm5\n\t"      // combine
-             //			pavgb	mm4, mm5			// avg of forward and prev by 1 pixel, use macro
-             V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]")   // mm5 gets modified if MMX
-             //			pavgb	mm6, mm4			// avg of center and surround interp vals, use macro
-             V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
-
-             // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors.
+  // in tight loop some vars are accessed faster in local storage
+  int64_t YMask = 0x00ff00ff00ff00ffull;        // to keep only luma
+  int64_t UVMask = 0xff00ff00ff00ff00ull;       // to keep only chroma
+  int64_t ShiftMask = 0xfefefefefefefefeull;    // to avoid shifting chroma to luma
+  int64_t QW256 = 0x0100010001000100ull;        // 4 256's
+  int64_t MaxComb;
+  int64_t MotionThreshold;
+  int64_t MotionSense;
+  int64_t i;
+  long LoopCtr;
+  long oldbx;
+
+  int64_t QW256B;
+  int64_t LastAvg = 0;          // interp value from left qword
+
+  // Set up our two parms that are actually evaluated for each pixel
+  i = GreedyMaxComb;
+  MaxComb =
+      i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i;
+
+  i = GreedyMotionThreshold;    // scale to range of 0-257
+  MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask;
+
+  i = GreedyMotionSense;        // scale to range of 0-257
+  MotionSense = i << 48 | i << 32 | i << 16 | i;
+
+  i = 0xffffffff - 256;
+  QW256B = i << 48 | i << 32 | i << 16 | i;     // save a couple instr on PMINSW instruct.
+
+  LoopCtr = size / 8 - 1;       // there are LineLength / 8 qwords per line but do 1 less, adj at end of loop
+
+  // For ease of reading, the comments below assume that we're operating on an odd
+  // field (i.e., that InfoIsOdd is true).  Assume the obvious for even lines..
+  __asm__ __volatile__ (
+      // save ebx (-fPIC)
+      MOVX " %%" XBX ", %[oldbx]\n\t"
+      MOVX "  %[L1],          %%" XAX "\n\t"
+      LEAX "  8(%%" XAX "),     %%" XBX "\n\t"  // next qword needed by DJR
+      MOVX "  %[L3],          %%" XCX "\n\t"
+      SUBX "  %%" XAX ",        %%" XCX "\n\t"  // carry L3 addr as an offset
+      MOVX "  %[L2P],         %%" XDX "\n\t"
+      MOVX "  %[L2],          %%" XSI "\n\t"
+      MOVX "  %[Dest],        %%" XDI "\n\t"    // DL1 if Odd or DL2 if Even
+
+      ".align 8\n\t"
+      "1:\n\t"
+      "movq  (%%" XSI "),      %%mm0\n\t"       // L2 - the newest weave pixel value
+      "movq  (%%" XAX "),      %%mm1\n\t"       // L1 - the top pixel
+      "movq  (%%" XDX "),      %%mm2\n\t"       // L2P - the prev weave pixel
+      "movq  (%%" XAX ", %%" XCX "), %%mm3\n\t" // L3, next odd row
+      "movq  %%mm1,          %%mm6\n\t"         // L1 - get simple single pixel interp
+
+      // pavgb   mm6, mm3                       // use macro below
+      V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]")
+
+      // DJR - Diagonal Jaggie Reduction
+      // In the event that we are going to use an average (Bob) pixel we do not want a jagged
+      // stair step effect.  To combat this we avg in the 2 horizontally adjacent pixels into the
+      // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels.
+
+      "movq  %[LastAvg],     %%mm4\n\t"         // the bob value from prev qword in row
+      "movq  %%mm6,          %[LastAvg]\n\t"    // save for next pass
+      "psrlq $48,            %%mm4\n\t"         // right justify 1 pixel
+      "movq  %%mm6,          %%mm7\n\t"         // copy of simple bob pixel
+      "psllq $16,            %%mm7\n\t"         // left justify 3 pixels
+      "por   %%mm7,          %%mm4\n\t"         // and combine
+      "movq  (%%" XBX "),      %%mm5\n\t"       // next horiz qword from L1
+      // pavgb   mm5, qword ptr[ebx+ecx]        // next horiz qword from L3, use macro below
+
+      V_PAVGB ("%%mm5", "(%%" XBX ",%%" XCX ")", "%%mm7", "%[ShiftMask]")
+      "psllq $48,            %%mm5\n\t"         // left just 1 pixel
+      "movq  %%mm6,          %%mm7\n\t"         // another copy of simple bob pixel
+      "psrlq $16,            %%mm7\n\t"         // right just 3 pixels
+      "por   %%mm7,          %%mm5\n\t"         // combine
+      // pavgb mm4, mm5                         // avg of forward and prev by 1 pixel, use macro
+      V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]")       // mm5 gets modified if MMX
+      // pavgb mm6, mm4                         // avg of center and surround interp vals, use macro
+      V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
+
+      // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors.
 #ifndef IS_MMX
-             //          pavgb	mm4, mm6			// 1/4 center, 3/4 adjacent
-             V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]")
-             //    		pavgb	mm6, mm4			// 3/8 center, 5/8 adjacent
-             V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
+      // pavgb mm4, mm6                         // 1/4 center, 3/4 adjacent
+      V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]")
+      // pavgb mm6, mm4                         // 3/8 center, 5/8 adjacent
+      V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
 #endif
-             // get abs value of possible L2 comb
-             "movq    %%mm6,        %%mm4\n\t"      // work copy of interp val
-             "movq    %%mm2,        %%mm7\n\t"      // L2
-             "psubusb %%mm4,        %%mm7\n\t"      // L2 - avg
-             "movq    %%mm4,        %%mm5\n\t"      // avg
-             "psubusb %%mm2,        %%mm5\n\t"      // avg - L2
-             "por     %%mm7,        %%mm5\n\t"      // abs(avg-L2)
-
-             // get abs value of possible L2P comb
-             "movq    %%mm0,        %%mm7\n\t"      // L2P
-             "psubusb %%mm4,        %%mm7\n\t"      // L2P - avg
-             "psubusb %%mm0,        %%mm4\n\t"      // avg - L2P
-             "por     %%mm7,        %%mm4\n\t"      // abs(avg-L2P)
-
-             // use L2 or L2P depending upon which makes smaller comb
-             "psubusb %%mm5,        %%mm4\n\t"      // see if it goes to zero
-             "psubusb %%mm5,        %%mm5\n\t"      // 0
-             "pcmpeqb %%mm5,        %%mm4\n\t"      // if (mm4=0) then FF else 0
-             "pcmpeqb %%mm4,        %%mm5\n\t"      // opposite of mm4
-
-             // if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55
-             "pand    %%mm2,        %%mm5\n\t"      // use L2 if mm5 == ff, else 0
-             "pand    %%mm0,        %%mm4\n\t"      // use L2P if mm4 = ff, else 0
-             "por     %%mm5,        %%mm4\n\t"      // may the best win
-
-             // Inventory: at this point we have the following values:
-             // mm0 = L2P (or L2)
-             // mm1 = L1
-             // mm2 = L2 (or L2P)
-             // mm3 = L3
-             // mm4 = the best of L2,L2P weave pixel, base upon comb
-             // mm6 = the avg interpolated value, if we need to use it
-
-             // Let's measure movement, as how much the weave pixel has changed
-             "movq    %%mm2,        %%mm7\n\t"
-             "psubusb %%mm0,        %%mm2\n\t"
-             "psubusb %%mm7,        %%mm0\n\t"
-             "por     %%mm2,        %%mm0\n\t"      // abs value of change, used later
-
-             // Now lets clip our chosen value to be not outside of the range
-             // of the high/low range L1-L3 by more than MaxComb.
-             // This allows some comb but limits the damages and also allows more
-             // detail than a boring oversmoothed clip.
-             "movq    %%mm1,        %%mm2\n\t"      // copy L1
-             //	pmaxub mm2, mm3                     // use macro
-             V_PMAXUB ("%%mm2", "%%mm3")            // now = Max(L1,L3)
-             "movq    %%mm1,        %%mm5\n\t"      // copy L1
-             // pminub	mm5, mm3                    // now = Min(L1,L3), use macro
-             V_PMINUB ("%%mm5", "%%mm3", "%%mm7")
-             // allow the value to be above the high or below the low by amt of MaxComb
-             "psubusb %[MaxComb],   %%mm5\n\t"      // lower min by diff
-             "paddusb %[MaxComb],   %%mm2\n\t"      // increase max by diff
-             // pmaxub	mm4, mm5                    // now = Max(best,Min(L1,L3) use macro
-             V_PMAXUB ("%%mm4", "%%mm5")
-             // pminub	mm4, mm2                    // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped
-             V_PMINUB ("%%mm4", "%%mm2", "%%mm7")
-
-             // Blend weave pixel with bob pixel, depending on motion val in mm0
-             "psubusb %[MotionThreshold], %%mm0\n\t"// test Threshold, clear chroma change >>>??
-             "pmullw  %[MotionSense], %%mm0\n\t"    // mul by user factor, keep low 16 bits
-             "movq    %[QW256], %%mm7\n\t"
-#ifdef HAVE_SSE
-             "pminsw  %%mm7,        %%mm0\n\t"      // max = 256
+      // get abs value of possible L2 comb
+      "movq    %%mm6,        %%mm4\n\t" // work copy of interp val
+      "movq    %%mm2,        %%mm7\n\t" // L2
+      "psubusb %%mm4,        %%mm7\n\t" // L2 - avg
+      "movq    %%mm4,        %%mm5\n\t" // avg
+      "psubusb %%mm2,        %%mm5\n\t" // avg - L2
+      "por     %%mm7,        %%mm5\n\t" // abs(avg-L2)
+
+      // get abs value of possible L2P comb
+      "movq    %%mm0,        %%mm7\n\t" // L2P
+      "psubusb %%mm4,        %%mm7\n\t" // L2P - avg
+      "psubusb %%mm0,        %%mm4\n\t" // avg - L2P
+      "por     %%mm7,        %%mm4\n\t" // abs(avg-L2P)
+
+      // use L2 or L2P depending upon which makes smaller comb
+      "psubusb %%mm5,        %%mm4\n\t" // see if it goes to zero
+      "psubusb %%mm5,        %%mm5\n\t" // 0
+      "pcmpeqb %%mm5,        %%mm4\n\t" // if (mm4=0) then FF else 0
+      "pcmpeqb %%mm4,        %%mm5\n\t" // opposite of mm4
+
+      // if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5=ff
+      "pand    %%mm2,        %%mm5\n\t" // use L2 if mm5 == ff, else 0
+      "pand    %%mm0,        %%mm4\n\t" // use L2P if mm4 = ff, else 0
+      "por     %%mm5,        %%mm4\n\t" // may the best win
+
+      // Inventory: at this point we have the following values:
+      // mm0 = L2P (or L2)
+      // mm1 = L1
+      // mm2 = L2 (or L2P)
+      // mm3 = L3
+      // mm4 = the best of L2,L2P weave pixel, based upon comb
+      // mm6 = the avg interpolated value, if we need to use it
+      // Let's measure movement, as how much the weave pixel has changed
+
+      "movq    %%mm2,        %%mm7\n\t"
+      "psubusb %%mm0,        %%mm2\n\t"
+      "psubusb %%mm7,        %%mm0\n\t"
+      "por     %%mm2,        %%mm0\n\t" // abs value of change, used later
+
+      // Now let's clip our chosen value to be not outside of the range
+      // of the high/low range L1-L3 by more than MaxComb.
+      // This allows some comb but limits the damage and also allows more
+      // detail than a boring oversmoothed clip.
+
+      "movq    %%mm1,        %%mm2\n\t" // copy L1
+      // pmaxub mm2, mm3                // use macro
+      V_PMAXUB ("%%mm2", "%%mm3")       // now = Max(L1,L3)
+      "movq    %%mm1,        %%mm5\n\t" // copy L1
+      // pminub mm5, mm3                // now = Min(L1,L3), use macro
+      V_PMINUB ("%%mm5", "%%mm3", "%%mm7")
+
+      // allow the value to be above the high or below the low by amt of MaxComb
+      "psubusb %[MaxComb],   %%mm5\n\t" // lower min by diff
+      "paddusb %[MaxComb],   %%mm2\n\t" // increase max by diff
+      // pmaxub mm4, mm5                // now = Max(best,Min(L1,L3)) use macro
+      V_PMAXUB ("%%mm4", "%%mm5")
+      // pminub mm4, mm2                // now = Min( Max(best, Min(L1,L3)), L2 ) = L2 clipped
+      V_PMINUB ("%%mm4", "%%mm2", "%%mm7")
+
+      // Blend weave pixel with bob pixel, depending on motion val in mm0
+      "psubusb %[MotionThreshold], %%mm0\n\t"   // test Threshold, clear chroma change >>>??
+      "pmullw  %[MotionSense], %%mm0\n\t"       // mul by user factor, keep low 16 bits
+      "movq    %[QW256], %%mm7\n\t"
+#ifdef IS_MMXEXT
+      "pminsw  %%mm7,        %%mm0\n\t" // max = 256
 #else
-             "paddusw %[QW256B],    %%mm0\n\t"      // add, may sat at fff..
+      "paddusw %[QW256B],    %%mm0\n\t" // add, may sat at fff..
+      "psubusw %[QW256B],    %%mm0\n\t" // now = Min(L1,256)  #endif +      "psubusw %%mm0,        %%mm7\n\t" // so the 2 sum to 256, weighted avg +      "movq    %%mm4,        %%mm2\n\t" // save weave chroma info before trashing +      "pand    %[YMask],     %%mm4\n\t" // keep only luma from calc'd value +      "pmullw  %%mm7,        %%mm4\n\t" // use more weave for less motion +      "pand    %[YMask],     %%mm6\n\t" // keep only luma from calc'd value +      "pmullw  %%mm0,        %%mm6\n\t" // use more bob for large motion +      "paddusw %%mm6,        %%mm4\n\t" // combine +      "psrlw   $8,           %%mm4\n\t" // div by 256 to get weighted avg +      // chroma comes from weave pixel +      "pand    %[UVMask],    %%mm2\n\t" // keep chroma +      "por     %%mm4,        %%mm2\n\t" // and combine +      V_MOVNTQ ("(%%" XDI ")", "%%mm2") // move in our clipped best, use macro +      // bump ptrs and loop +      LEAX "    8(%%" XAX "),   %%" XAX "\n\t" +      LEAX "    8(%%" XBX "),   %%" XBX "\n\t" +      LEAX "    8(%%" XDX "),   %%" XDX "\n\t" +      LEAX "    8(%%" XDI "),   %%" XDI "\n\t" +      LEAX "    8(%%" XSI "),   %%" XSI "\n\t" +      DECX "    %[LoopCtr]\n\t" +       +      "jg      1b\n\t"   // loop if not to last line +      // note P-III default assumes backward branches taken +      "jl      1f\n\t"          // done +      MOVX "    %%" XAX ",      %%" XBX "\n\t"  // sharpness lookahead 1 byte only, be wrong on 1 +      "jmp     1b\n\t" +       +      "1:\n\t"       +      MOVX " %[oldbx], %%" XBX "\n\t" +      "emms\n\t":     /* no outputs */ + +      :[LastAvg] "m" (LastAvg), +       [L1] "m" (L1), +       [L3] "m" (L3), +       [L2P] "m" (L2P), +       [L2] "m" (L2), +       [Dest] "m" (Dest), +       [ShiftMask] "m" (ShiftMask), +       [MaxComb] "m" (MaxComb), +       [MotionThreshold] "m" (MotionThreshold), +       [MotionSense] "m" (MotionSense), +       [QW256B] "m" (QW256B), +       [YMask] "m" (YMask), +       [UVMask] "m" (UVMask), +       [LoopCtr] "m" (LoopCtr), +       [QW256] "m" (QW256), +       [oldbx] "m" (oldbx) +      : XAX, XCX, XDX, XSI, XDI, +      "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)", +      /* FIXME: breaks unless compiling with -mmmx +         "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", */ +      "memory", "cc");  } diff --git a/gst/deinterlace2/tvtime/greedyh.c b/gst/deinterlace2/tvtime/greedyh.c index 623c2d8b..f9d33e74 100644 --- a/gst/deinterlace2/tvtime/greedyh.c +++ b/gst/deinterlace2/tvtime/greedyh.c @@ -41,51 +41,244 @@  #include "gstdeinterlace2.h"  #include "speedy.h" +static const unsigned int GreedyMaxComb = 5; +static const unsigned int GreedyMotionThreshold = 25; +static const unsigned int GreedyMotionSense = 30; -#define MAXCOMB_DEFAULT          5 -#define MOTIONTHRESHOLD_DEFAULT 25 -#define MOTIONSENSE_DEFAULT     30 +void +greedyDScaler_C (uint8_t * L1, uint8_t * L2, uint8_t * L3, uint8_t * L2P, +    uint8_t * Dest, int size) +{ +  int Pos; +  uint8_t l1_l, l1_1_l, l3_l, l3_1_l; +  uint8_t l1_c, l1_1_c, l3_c, l3_1_c; +  uint8_t avg_l, avg_c, avg_l_1, avg_c_1; +  uint8_t avg_l__1 = 0, avg_c__1 = 0; +  uint8_t avg_s_l, avg_s_c; +  uint8_t avg_sc_l, avg_sc_c; +  uint8_t best_l, best_c; +  uint16_t mov_l; +  uint8_t out_l, out_c; +  uint8_t l2_l, l2_c, lp2_l, lp2_c; +  uint8_t l2_l_diff, l2_c_diff, lp2_l_diff, lp2_c_diff; +  uint8_t min_l, min_c, max_l, max_c; + +  for (Pos = 0; Pos < size; Pos += 2) { +    l1_l = L1[0]; +    l1_c = L1[1]; +    l3_l = L3[0]; +    l3_c = L3[1]; + +    if 
(Pos == size - 1) { +      l1_1_l = l1_l; +      l1_1_c = l1_c; +      l3_1_l = l3_l; +      l3_1_c = l3_c; +    } else { +      l1_1_l = L1[2]; +      l1_1_c = L1[3]; +      l3_1_l = L3[2]; +      l3_1_c = L3[3]; +    } + +    /* Average of L1 and L3 */ +    avg_l = (l1_l + l3_l) / 2; +    avg_c = (l1_c + l3_c) / 2; + +    /* Average of next L1 and next L3 */ +    avg_l_1 = (l1_1_l + l3_1_l) / 2; +    avg_c_1 = (l1_1_c + l3_1_c) / 2; + +    /* Calculate average of one pixel forward and previous */ +    avg_s_l = (avg_l__1 + avg_l_1) / 2; +    avg_s_c = (avg_c__1 + avg_c_1) / 2; + +    /* Calculate average of center and surrounding pixels */ +    avg_sc_l = (avg_l + avg_s_l) / 2; +    avg_sc_c = (avg_c + avg_s_c) / 2; + +    /* move forward */ +    avg_l__1 = avg_l; +    avg_c__1 = avg_c; + +    /* Get best L2/L2P, i.e. least diff from above average */ +    l2_l = L2[0]; +    l2_c = L2[1]; +    lp2_l = L2P[0]; +    lp2_c = L2P[1]; + +    l2_l_diff = ABS (l2_l - avg_sc_l); +    l2_c_diff = ABS (l2_c - avg_sc_c); + +    lp2_l_diff = ABS (lp2_l - avg_sc_l); +    lp2_c_diff = ABS (lp2_c - avg_sc_c); + +    if (l2_l_diff > lp2_l_diff) +      best_l = lp2_l; +    else +      best_l = l2_l; + +    if (l2_c_diff > lp2_c_diff) +      best_c = lp2_c; +    else +      best_c = l2_c; + +    /* Clip this best L2/L2P by L1/L3 and allow to differ by GreedyMaxComb */ +    max_l = MAX (l1_l, l3_l); +    min_l = MIN (l1_l, l3_l); -unsigned int GreedyMaxComb; +    if (max_l < 256 - GreedyMaxComb) +      max_l += GreedyMaxComb; +    else +      max_l = 255; -unsigned int GreedyMotionThreshold; +    if (min_l > GreedyMaxComb) +      min_l -= GreedyMaxComb; +    else +      min_l = 0; -unsigned int GreedyMotionSense; +    max_c = MAX (l1_c, l3_c); +    min_c = MIN (l1_c, l3_c); +    if (max_c < 256 - GreedyMaxComb) +      max_c += GreedyMaxComb; +    else +      max_c = 255; -#define IS_SSE -#define SSE_TYPE SSE -#define FUNCT_NAME greedyDScaler_SSE +    if (min_c > GreedyMaxComb) +      min_c -= GreedyMaxComb; +    else +      min_c = 0; + +    out_l = CLAMP (best_l, min_l, max_l); +    out_c = CLAMP (best_c, min_c, max_c); + +    /* Do motion compensation for luma, i.e. 
how much +     * the weave pixel differs */ +    mov_l = ABS (l2_l - lp2_l); +    if (mov_l > GreedyMotionThreshold) +      mov_l -= GreedyMotionThreshold; +    else +      mov_l = 0; + +    mov_l = mov_l * GreedyMotionSense; +    if (mov_l > 256) +      mov_l = 256; + +    /* Weighted sum on clipped weave pixel and average */ +    out_l = (out_l * (256 - mov_l) + avg_sc_l * mov_l) / 256; + +    Dest[0] = out_l; +    Dest[1] = out_c; + +    Dest += 2; +    L1 += 2; +    L2 += 2; +    L3 += 2; +    L2P += 2; +  } +} + +#define IS_MMXEXT +#define SIMD_TYPE MMXEXT +#define FUNCT_NAME greedyDScaler_MMXEXT  #include "greedyh.asm" -#undef SSE_TYPE -#undef IS_SSE +#undef SIMD_TYPE +#undef IS_MMXEXT  #undef FUNCT_NAME -#define IS_3DNOW +#define IS_TDNOW +#define SIMD_TYPE TDNOW  #define FUNCT_NAME greedyDScaler_3DNOW -#define SSE_TYPE 3DNOW  #include "greedyh.asm" -#undef SSE_TYPE -#undef IS_3DNOW +#undef SIMD_TYPE +#undef IS_TDNOW  #undef FUNCT_NAME  #define IS_MMX -#define SSE_TYPE MMX +#define SIMD_TYPE MMX  #define FUNCT_NAME greedyDScaler_MMX  #include "greedyh.asm" -#undef SSE_TYPE +#undef SIMD_TYPE  #undef IS_MMX  #undef FUNCT_NAME -void +static void  deinterlace_frame_di_greedyh (GstDeinterlace2 * object)  { -  if (object->cpu_feature_flags & OIL_IMPL_FLAG_SSE) { -    greedyh_filter_sse (object); +  void (*func) (uint8_t * L1, uint8_t * L2, uint8_t * L3, uint8_t * L2P, +      uint8_t * Dest, int size); + +  int InfoIsOdd = 0; +  int Line; +  unsigned int Pitch = object->field_stride; + +  unsigned char *L1;            // ptr to Line1, of 3 +  unsigned char *L2;            // ptr to Line2, the weave line +  unsigned char *L3;            // ptr to Line3 + +  unsigned char *L2P;           // ptr to prev Line2 +  unsigned char *Dest = GST_BUFFER_DATA (object->out_buf); + +  if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) { +    func = greedyDScaler_MMXEXT;    } else if (object->cpu_feature_flags & OIL_IMPL_FLAG_3DNOW) { -    greedyh_filter_3dnow (object); +    func = greedyDScaler_3DNOW; +  } else if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMX) { +    func = greedyDScaler_MMX;    } else { -    greedyh_filter_mmx (object); +    func = greedyDScaler_C; +  } + +  // copy first even line no matter what, and the first odd line if we're +  // processing an EVEN field. (note diff from other deint rtns.) 
+
+  if (object->field_history[object->history_count - 1].flags ==
+      PICTURE_INTERLACED_BOTTOM) {
+    InfoIsOdd = 1;
+
+    L1 = GST_BUFFER_DATA (object->field_history[object->history_count - 2].buf);
+    L2 = GST_BUFFER_DATA (object->field_history[object->history_count - 1].buf);
+    L3 = L1 + Pitch;
+    L2P =
+        GST_BUFFER_DATA (object->field_history[object->history_count - 3].buf);
+
+    // copy first even line
+    object->pMemcpy (Dest, L1, object->line_length);
+    Dest += object->output_stride;
+  } else {
+    InfoIsOdd = 0;
+    L1 = GST_BUFFER_DATA (object->field_history[object->history_count - 2].buf);
+    L2 = GST_BUFFER_DATA (object->field_history[object->history_count -
+            1].buf) + Pitch;
+    L3 = L1 + Pitch;
+    L2P =
+        GST_BUFFER_DATA (object->field_history[object->history_count - 3].buf) +
+        Pitch;
+
+    // copy first even line
+    object->pMemcpy (Dest, GST_BUFFER_DATA (object->field_history[0].buf),
+        object->line_length);
+    Dest += object->output_stride;
+    // then first odd line
+    object->pMemcpy (Dest, L1, object->line_length);
+    Dest += object->output_stride;
+  }
+
+  for (Line = 0; Line < (object->field_height - 1); ++Line) {
+    func (L1, L2, L3, L2P, Dest, object->line_length);
+    Dest += object->output_stride;
+    object->pMemcpy (Dest, L3, object->line_length);
+    Dest += object->output_stride;
+
+    L1 += Pitch;
+    L2 += Pitch;
+    L3 += Pitch;
+    L2P += Pitch;
+  }
+
+  if (InfoIsOdd) {
+    object->pMemcpy (Dest, L2, object->line_length);
   }
 }
@@ -94,7 +287,7 @@ static deinterlace_method_t greedyh_method = {
   "Motion Adaptive: Advanced Detection",
   "AdaptiveAdvanced",
   4,
-  OIL_IMPL_FLAG_MMX,
+  0,
   0,
   0,
   0,
@@ -117,32 +310,5 @@
 deinterlace_method_t *
 dscaler_greedyh_get_method (void)
 {
-  greedyh_init ();
   return &greedyh_method;
 }
-
-void
-greedyh_init (void)
-{
-  GreedyMaxComb = MAXCOMB_DEFAULT;
-  GreedyMotionThreshold = MOTIONTHRESHOLD_DEFAULT;
-  GreedyMotionSense = MOTIONSENSE_DEFAULT;
-}
-
-void
-greedyh_filter_mmx (GstDeinterlace2 * object)
-{
-  greedyDScaler_MMX (object);
-}
-
-void
-greedyh_filter_3dnow (GstDeinterlace2 * object)
-{
-  greedyDScaler_3DNOW (object);
-}
-
-void
-greedyh_filter_sse (GstDeinterlace2 * object)
-{
-  greedyDScaler_SSE (object);
-}
diff --git a/gst/deinterlace2/tvtime/greedyhmacros.h b/gst/deinterlace2/tvtime/greedyhmacros.h
index 5f65959c..3f1c72c9 100644
--- a/gst/deinterlace2/tvtime/greedyhmacros.h
+++ b/gst/deinterlace2/tvtime/greedyhmacros.h
@@ -21,7 +21,7 @@
 // BEFORE USING THESE YOU MUST SET:
-// #define SSE_TYPE SSE            (or MMX or 3DNOW)
+// #define SIMD_TYPE MMXEXT            (or MMX or TDNOW)
 // some macros for pavgb instruction
 //      V_PAVGB(mmr1, mmr2, mmr work register, smask) mmr2 may = mmrw if you can trash it
@@ -33,21 +33,21 @@
 	"pand    "smask", "mmr1"\n\t"            \
 	"psrlw   $1,      "mmr1"\n\t"            \
 	"paddusb "mmrw",  "mmr1"\n\t"
-#define V_PAVGB_SSE(mmr1, mmr2, mmrw, smask)      "pavgb   "mmr2", "mmr1"\n\t"
-#define V_PAVGB_3DNOW(mmr1, mmr2, mmrw, smask)    "pavgusb "mmr2", "mmr1"\n\t"
-#define V_PAVGB(mmr1, mmr2, mmrw, smask)          V_PAVGB2(mmr1, mmr2, mmrw, smask, SSE_TYPE)
-#define V_PAVGB2(mmr1, mmr2, mmrw, smask, ssetyp) V_PAVGB3(mmr1, mmr2, mmrw, smask, ssetyp)
-#define V_PAVGB3(mmr1, mmr2, mmrw, smask, ssetyp) V_PAVGB_##ssetyp(mmr1, mmr2, mmrw, smask)
+#define V_PAVGB_MMXEXT(mmr1, mmr2, mmrw, smask)      "pavgb   "mmr2", "mmr1"\n\t"
+#define V_PAVGB_TDNOW(mmr1, mmr2, mmrw, smask)    "pavgusb "mmr2", "mmr1"\n\t"
+#define V_PAVGB(mmr1, mmr2, mmrw, smask)          V_PAVGB2(mmr1, mmr2, mmrw, smask, SIMD_TYPE)
+#define V_PAVGB2(mmr1, mmr2, mmrw, smask, simdtype) V_PAVGB3(mmr1, mmr2, mmrw, smask, simdtype)
+#define V_PAVGB3(mmr1, mmr2, mmrw, smask, simdtype) V_PAVGB_##simdtype(mmr1, mmr2, mmrw, smask)
 
 // some macros for pmaxub instruction
 #define V_PMAXUB_MMX(mmr1, mmr2) \
     "psubusb "mmr2", "mmr1"\n\t" \
     "paddusb "mmr2", "mmr1"\n\t"
-#define V_PMAXUB_SSE(mmr1, mmr2)      "pmaxub "mmr2", "mmr1"\n\t"
-#define V_PMAXUB_3DNOW(mmr1, mmr2)    V_PMAXUB_MMX(mmr1, mmr2)  // use MMX version
-#define V_PMAXUB(mmr1, mmr2)          V_PMAXUB2(mmr1, mmr2, SSE_TYPE)
-#define V_PMAXUB2(mmr1, mmr2, ssetyp) V_PMAXUB3(mmr1, mmr2, ssetyp)
-#define V_PMAXUB3(mmr1, mmr2, ssetyp) V_PMAXUB_##ssetyp(mmr1, mmr2)
+#define V_PMAXUB_MMXEXT(mmr1, mmr2)      "pmaxub "mmr2", "mmr1"\n\t"
+#define V_PMAXUB_TDNOW(mmr1, mmr2)    V_PMAXUB_MMX(mmr1, mmr2)  // use MMX version
+#define V_PMAXUB(mmr1, mmr2)          V_PMAXUB2(mmr1, mmr2, SIMD_TYPE)
+#define V_PMAXUB2(mmr1, mmr2, simdtype) V_PMAXUB3(mmr1, mmr2, simdtype)
+#define V_PMAXUB3(mmr1, mmr2, simdtype) V_PMAXUB_##simdtype(mmr1, mmr2)
 
 // some macros for pminub instruction
 //      V_PMINUB(mmr1, mmr2, mmr work register)     mmr2 may NOT = mmrw
@@ -56,19 +56,19 @@
     "psubusb "mmr2", "mmrw"\n\t"       \
     "paddusb "mmrw", "mmr1"\n\t"       \
     "psubusb "mmrw", "mmr1"\n\t"
-#define V_PMINUB_SSE(mmr1, mmr2, mmrw)      "pminub "mmr2", "mmr1"\n\t"
-#define V_PMINUB_3DNOW(mmr1, mmr2, mmrw)    V_PMINUB_MMX(mmr1, mmr2, mmrw)  // use MMX version
-#define V_PMINUB(mmr1, mmr2, mmrw)          V_PMINUB2(mmr1, mmr2, mmrw, SSE_TYPE)
-#define V_PMINUB2(mmr1, mmr2, mmrw, ssetyp) V_PMINUB3(mmr1, mmr2, mmrw, ssetyp)
-#define V_PMINUB3(mmr1, mmr2, mmrw, ssetyp) V_PMINUB_##ssetyp(mmr1, mmr2, mmrw)
+#define V_PMINUB_MMXEXT(mmr1, mmr2, mmrw)      "pminub "mmr2", "mmr1"\n\t"
+#define V_PMINUB_TDNOW(mmr1, mmr2, mmrw)    V_PMINUB_MMX(mmr1, mmr2, mmrw)  // use MMX version
+#define V_PMINUB(mmr1, mmr2, mmrw)          V_PMINUB2(mmr1, mmr2, mmrw, SIMD_TYPE)
+#define V_PMINUB2(mmr1, mmr2, mmrw, simdtype) V_PMINUB3(mmr1, mmr2, mmrw, simdtype)
+#define V_PMINUB3(mmr1, mmr2, mmrw, simdtype) V_PMINUB_##simdtype(mmr1, mmr2, mmrw)
 
 // some macros for movntq instruction
 //      V_MOVNTQ(mmr1, mmr2)
 #define V_MOVNTQ_MMX(mmr1, mmr2)      "movq   "mmr2", "mmr1"\n\t"
-#define V_MOVNTQ_3DNOW(mmr1, mmr2)    "movq   "mmr2", "mmr1"\n\t"
-#define V_MOVNTQ_SSE(mmr1, mmr2)      "movntq "mmr2", "mmr1"\n\t"
-#define V_MOVNTQ(mmr1, mmr2)          V_MOVNTQ2(mmr1, mmr2, SSE_TYPE)
-#define V_MOVNTQ2(mmr1, mmr2, ssetyp) V_MOVNTQ3(mmr1, mmr2, ssetyp)
-#define V_MOVNTQ3(mmr1, mmr2, ssetyp) V_MOVNTQ_##ssetyp(mmr1, mmr2)
+#define V_MOVNTQ_TDNOW(mmr1, mmr2)    "movq   "mmr2", "mmr1"\n\t"
+#define V_MOVNTQ_MMXEXT(mmr1, mmr2)      "movntq "mmr2", "mmr1"\n\t"
+#define V_MOVNTQ(mmr1, mmr2)          V_MOVNTQ2(mmr1, mmr2, SIMD_TYPE)
+#define V_MOVNTQ2(mmr1, mmr2, simdtype) V_MOVNTQ3(mmr1, mmr2, simdtype)
+#define V_MOVNTQ3(mmr1, mmr2, simdtype) V_MOVNTQ_##simdtype(mmr1, mmr2)
 // end of macros
