Diffstat (limited to 'gst/deinterlace/tvtime')
33 files changed, 6147 insertions, 0 deletions
diff --git a/gst/deinterlace/tvtime/greedy.c b/gst/deinterlace/tvtime/greedy.c new file mode 100644 index 00000000..293d82fa --- /dev/null +++ b/gst/deinterlace/tvtime/greedy.c @@ -0,0 +1,488 @@ +/* + * + * GStreamer + * Copyright (c) 2000 Tom Barry All rights reserved. + * mmx.h port copyright (c) 2002 Billy Biggs <vektor@dumbterm.net>. + * + * Copyright (C) 2008 Sebastian Dröge <slomo@collabora.co.uk> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Relicensed for GStreamer from GPL to LGPL with permit from Tom Barry + * and Billy Biggs. + * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578 + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "_stdint.h" + +#include "gstdeinterlace.h" +#include <string.h> + +#define GST_TYPE_DEINTERLACE_METHOD_GREEDY_L (gst_deinterlace_method_greedy_l_get_type ()) +#define GST_IS_DEINTERLACE_METHOD_GREEDY_L(obj) (G_TYPE_CHECK_INSTANCE_TYPE ((obj), GST_TYPE_DEINTERLACE_METHOD_GREEDY_L)) +#define GST_IS_DEINTERLACE_METHOD_GREEDY_L_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), GST_TYPE_DEINTERLACE_METHOD_GREEDY_L)) +#define GST_DEINTERLACE_METHOD_GREEDY_L_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS ((obj), GST_TYPE_DEINTERLACE_METHOD_GREEDY_L, GstDeinterlaceMethodGreedyLClass)) +#define GST_DEINTERLACE_METHOD_GREEDY_L(obj) (G_TYPE_CHECK_INSTANCE_CAST ((obj), GST_TYPE_DEINTERLACE_METHOD_GREEDY_L, GstDeinterlaceMethodGreedyL)) +#define GST_DEINTERLACE_METHOD_GREEDY_L_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST ((klass), GST_TYPE_DEINTERLACE_METHOD_GREEDY_L, GstDeinterlaceMethodGreedyLClass)) +#define GST_DEINTERLACE_METHOD_GREEDY_L_CAST(obj) ((GstDeinterlaceMethodGreedyL*)(obj)) + +GType gst_deinterlace_method_greedy_l_get_type (void); + +typedef struct +{ + GstDeinterlaceMethod parent; + + guint max_comb; +} GstDeinterlaceMethodGreedyL; + +typedef struct +{ + GstDeinterlaceMethodClass parent_class; + void (*scanline) (GstDeinterlaceMethodGreedyL * self, uint8_t * L2, + uint8_t * L1, uint8_t * L3, uint8_t * L2P, uint8_t * Dest, int size); +} GstDeinterlaceMethodGreedyLClass; + +// This is a simple lightweight DeInterlace method that uses little CPU time +// but gives very good results for low or intermediate motion. +// It defers frames by one field, but that does not seem to produce noticeable +// lip sync problems. +// +// The method used is to take either the older or newer weave pixel depending +// upon which gives the smaller comb factor, and then clip to avoid large damage +// when wrong. +// +// I'd intended this to be part of a larger more elaborate method added to +// Blended Clip but this gives too good results for the CPU to ignore here.
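+// A worked instance of the rule above (illustrative values only, assuming
+// the GLib ABS/MIN/MAX/CLAMP macros that this file already uses):
+//
+//   t1 = 100, b1 = 120          ->  avg = (t1 + b1) / 2 = 110
+//   m0 = 180 (newest weave)     ->  ABS (m0 - avg) = 70
+//   m2 = 105 (previous weave)   ->  ABS (m2 - avg) = 5, so best = m2 = 105
+//   max_comb = 15               ->  clip range = [MIN (t1, b1) - 15,
+//                                   MAX (t1, b1) + 15] = [85, 135]
+//   output = CLAMP (best, 85, 135) = 105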
+ +static inline void +deinterlace_greedy_packed422_scanline_c (GstDeinterlaceMethodGreedyL * self, + uint8_t * m0, uint8_t * t1, + uint8_t * b1, uint8_t * m2, uint8_t * output, int width) +{ + int avg, l2_diff, lp2_diff, max, min, best; + guint max_comb = self->max_comb; + + // L2 == m0 + // L1 == t1 + // L3 == b1 + // LP2 == m2 + + while (width--) { + avg = (*t1 + *b1) / 2; + + l2_diff = ABS (*m0 - avg); + lp2_diff = ABS (*m2 - avg); + + if (l2_diff > lp2_diff) + best = *m2; + else + best = *m0; + + max = MAX (*t1, *b1); + min = MIN (*t1, *b1); + + if (max < 256 - max_comb) + max += max_comb; + else + max = 255; + + if (min > max_comb) + min -= max_comb; + else + min = 0; + + *output = CLAMP (best, min, max); + + // Advance to the next set of pixels. + output += 1; + m0 += 1; + t1 += 1; + b1 += 1; + m2 += 1; + } +} + +#ifdef BUILD_X86_ASM +#include "mmx.h" +static void +deinterlace_greedy_packed422_scanline_mmx (GstDeinterlaceMethodGreedyL * self, + uint8_t * m0, uint8_t * t1, + uint8_t * b1, uint8_t * m2, uint8_t * output, int width) +{ + mmx_t MaxComb; + mmx_t ShiftMask; + + // How badly do we let it weave? 0-255 + MaxComb.ub[0] = self->max_comb; + MaxComb.ub[1] = self->max_comb; + MaxComb.ub[2] = self->max_comb; + MaxComb.ub[3] = self->max_comb; + MaxComb.ub[4] = self->max_comb; + MaxComb.ub[5] = self->max_comb; + MaxComb.ub[6] = self->max_comb; + MaxComb.ub[7] = self->max_comb; + + ShiftMask.ub[0] = 0x7f; + ShiftMask.ub[1] = 0x7f; + ShiftMask.ub[2] = 0x7f; + ShiftMask.ub[3] = 0x7f; + ShiftMask.ub[4] = 0x7f; + ShiftMask.ub[5] = 0x7f; + ShiftMask.ub[6] = 0x7f; + ShiftMask.ub[7] = 0x7f; + + // L2 == m0 + // L1 == t1 + // L3 == b1 + // LP2 == m2 + + movq_m2r (MaxComb, mm6); + + for (; width > 7; width -= 8) { + movq_m2r (*t1, mm1); // L1 + movq_m2r (*m0, mm2); // L2 + movq_m2r (*b1, mm3); // L3 + movq_m2r (*m2, mm0); // LP2 + + // average L1 and L3 leave result in mm4 + movq_r2r (mm1, mm4); // L1 + movq_r2r (mm3, mm5); // L3 + psrlw_i2r (1, mm4); // L1/2 + pand_m2r (ShiftMask, mm4); + psrlw_i2r (1, mm5); // L3/2 + pand_m2r (ShiftMask, mm5); + paddusb_r2r (mm5, mm4); // (L1 + L3) / 2 + + // get abs value of possible L2 comb + movq_r2r (mm2, mm7); // L2 + psubusb_r2r (mm4, mm7); // L2 - avg + movq_r2r (mm4, mm5); // avg + psubusb_r2r (mm2, mm5); // avg - L2 + por_r2r (mm7, mm5); // abs(avg-L2) + + // get abs value of possible LP2 comb + movq_r2r (mm0, mm7); // LP2 + psubusb_r2r (mm4, mm7); // LP2 - avg + psubusb_r2r (mm0, mm4); // avg - LP2 + por_r2r (mm7, mm4); // abs(avg-LP2) + + // use L2 or LP2 depending upon which makes smaller comb + psubusb_r2r (mm5, mm4); // see if it goes to zero + psubusb_r2r (mm5, mm5); // 0 + pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0 + pcmpeqb_r2r (mm4, mm5); // opposite of mm4 + + // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55 + pand_r2r (mm2, mm5); // use L2 if mm5 == ff, else 0 + pand_r2r (mm0, mm4); // use LP2 if mm4 = ff, else 0 + por_r2r (mm5, mm4); // may the best win + + // Now lets clip our chosen value to be not outside of the range + // of the high/low range L1-L3 by more than abs(L1-L3) + // This allows some comb but limits the damages and also allows more + // detail than a boring oversmoothed clip. + + movq_r2r (mm1, mm2); // copy L1 + psubusb_r2r (mm3, mm2); // - L3, with saturation + paddusb_r2r (mm3, mm2); // now = Max(L1,L3) + + pcmpeqb_r2r (mm7, mm7); // all ffffffff + psubusb_r2r (mm1, mm7); // - L1 + paddusb_r2r (mm7, mm3); // add, may sat at fff.. 
+ psubusb_r2r (mm7, mm3); // now = Min(L1,L3) + + // allow the value to be above the high or below the low by amt of MaxComb + paddusb_r2r (mm6, mm2); // increase max by diff + psubusb_r2r (mm6, mm3); // lower min by diff + + psubusb_r2r (mm3, mm4); // best - Min + paddusb_r2r (mm3, mm4); // now = Max(best,Min(L1,L3) + + pcmpeqb_r2r (mm7, mm7); // all ffffffff + psubusb_r2r (mm4, mm7); // - Max(best,Min(best,L3) + paddusb_r2r (mm7, mm2); // add may sat at FFF.. + psubusb_r2r (mm7, mm2); // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped + + movq_r2m (mm2, *output); // move in our clipped best + + // Advance to the next set of pixels. + output += 8; + m0 += 8; + t1 += 8; + b1 += 8; + m2 += 8; + } + emms (); + if (width > 0) + deinterlace_greedy_packed422_scanline_c (self, m0, t1, b1, m2, output, + width); +} + +#include "sse.h" + +static void +deinterlace_greedy_packed422_scanline_mmxext (GstDeinterlaceMethodGreedyL * + self, uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, + uint8_t * output, int width) +{ + mmx_t MaxComb; + + // How badly do we let it weave? 0-255 + MaxComb.ub[0] = self->max_comb; + MaxComb.ub[1] = self->max_comb; + MaxComb.ub[2] = self->max_comb; + MaxComb.ub[3] = self->max_comb; + MaxComb.ub[4] = self->max_comb; + MaxComb.ub[5] = self->max_comb; + MaxComb.ub[6] = self->max_comb; + MaxComb.ub[7] = self->max_comb; + + // L2 == m0 + // L1 == t1 + // L3 == b1 + // LP2 == m2 + + movq_m2r (MaxComb, mm6); + + for (; width > 7; width -= 8) { + movq_m2r (*t1, mm1); // L1 + movq_m2r (*m0, mm2); // L2 + movq_m2r (*b1, mm3); // L3 + movq_m2r (*m2, mm0); // LP2 + + // average L1 and L3 leave result in mm4 + movq_r2r (mm1, mm4); // L1 + pavgb_r2r (mm3, mm4); // (L1 + L3)/2 + + // get abs value of possible L2 comb + movq_r2r (mm2, mm7); // L2 + psubusb_r2r (mm4, mm7); // L2 - avg + movq_r2r (mm4, mm5); // avg + psubusb_r2r (mm2, mm5); // avg - L2 + por_r2r (mm7, mm5); // abs(avg-L2) + + // get abs value of possible LP2 comb + movq_r2r (mm0, mm7); // LP2 + psubusb_r2r (mm4, mm7); // LP2 - avg + psubusb_r2r (mm0, mm4); // avg - LP2 + por_r2r (mm7, mm4); // abs(avg-LP2) + + // use L2 or LP2 depending upon which makes smaller comb + psubusb_r2r (mm5, mm4); // see if it goes to zero + pxor_r2r (mm5, mm5); // 0 + pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0 + pcmpeqb_r2r (mm4, mm5); // opposite of mm4 + + // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55 + pand_r2r (mm2, mm5); // use L2 if mm5 == ff, else 0 + pand_r2r (mm0, mm4); // use LP2 if mm4 = ff, else 0 + por_r2r (mm5, mm4); // may the best win + + // Now lets clip our chosen value to be not outside of the range + // of the high/low range L1-L3 by more than abs(L1-L3) + // This allows some comb but limits the damages and also allows more + // detail than a boring oversmoothed clip. + + movq_r2r (mm1, mm2); // copy L1 + pmaxub_r2r (mm3, mm2); // now = Max(L1,L3) + + pminub_r2r (mm1, mm3); // now = Min(L1,L3) + + // allow the value to be above the high or below the low by amt of MaxComb + paddusb_r2r (mm6, mm2); // increase max by diff + psubusb_r2r (mm6, mm3); // lower min by diff + + + pmaxub_r2r (mm3, mm4); // now = Max(best,Min(L1,L3) + pminub_r2r (mm4, mm2); // now = Min( Max(best, Min(L1,L3)), L2 )=L2 clipped + + movq_r2m (mm2, *output); // move in our clipped best + + // Advance to the next set of pixels. 
+ output += 8; + m0 += 8; + t1 += 8; + b1 += 8; + m2 += 8; + } + emms (); + + if (width > 0) + deinterlace_greedy_packed422_scanline_c (self, m0, t1, b1, m2, output, + width); +} + +#endif + +static void +deinterlace_frame_di_greedy (GstDeinterlaceMethod * d_method, + GstDeinterlace * object, GstBuffer * outbuf) +{ + GstDeinterlaceMethodGreedyL *self = + GST_DEINTERLACE_METHOD_GREEDY_L (d_method); + GstDeinterlaceMethodGreedyLClass *klass = + GST_DEINTERLACE_METHOD_GREEDY_L_GET_CLASS (self); + int InfoIsOdd = 0; + int Line; + unsigned int Pitch = object->field_stride; + unsigned char *L1; // ptr to Line1, of 3 + unsigned char *L2; // ptr to Line2, the weave line + unsigned char *L3; // ptr to Line3 + + unsigned char *L2P; // ptr to prev Line2 + unsigned char *Dest = GST_BUFFER_DATA (outbuf); + + // copy first even line no matter what, and the first odd line if we're + // processing an EVEN field. (note diff from other deint rtns.) + + if (object->field_history[object->history_count - 1].flags == + PICTURE_INTERLACED_BOTTOM) { + InfoIsOdd = 1; + + L1 = GST_BUFFER_DATA (object->field_history[object->history_count - 2].buf); + L2 = GST_BUFFER_DATA (object->field_history[object->history_count - 1].buf); + L3 = L1 + Pitch; + L2P = + GST_BUFFER_DATA (object->field_history[object->history_count - 3].buf); + + // copy first even line + oil_memcpy (Dest, L1, object->row_stride); + Dest += object->row_stride; + } else { + InfoIsOdd = 0; + L1 = GST_BUFFER_DATA (object->field_history[object->history_count - 2].buf); + L2 = GST_BUFFER_DATA (object->field_history[object->history_count - + 1].buf) + Pitch; + L3 = L1 + Pitch; + L2P = + GST_BUFFER_DATA (object->field_history[object->history_count - 3].buf) + + Pitch; + + // copy first even line + oil_memcpy (Dest, GST_BUFFER_DATA (object->field_history[0].buf), + object->row_stride); + Dest += object->row_stride; + // then first odd line + oil_memcpy (Dest, L1, object->row_stride); + Dest += object->row_stride; + } + + for (Line = 0; Line < (object->field_height - 1); ++Line) { + klass->scanline (self, L2, L1, L3, L2P, Dest, object->row_stride); + Dest += object->row_stride; + oil_memcpy (Dest, L3, object->row_stride); + Dest += object->row_stride; + + L1 += Pitch; + L2 += Pitch; + L3 += Pitch; + L2P += Pitch; + } + + if (InfoIsOdd) { + oil_memcpy (Dest, L2, object->row_stride); + } +} + + +G_DEFINE_TYPE (GstDeinterlaceMethodGreedyL, gst_deinterlace_method_greedy_l, + GST_TYPE_DEINTERLACE_METHOD); + +enum +{ + ARG_0, + ARG_MAX_COMB +}; + +static void +gst_deinterlace_method_greedy_l_set_property (GObject * object, guint prop_id, + const GValue * value, GParamSpec * pspec) +{ + GstDeinterlaceMethodGreedyL *self = GST_DEINTERLACE_METHOD_GREEDY_L (object); + + switch (prop_id) { + case ARG_MAX_COMB: + self->max_comb = g_value_get_uint (value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec); + } +} + +static void +gst_deinterlace_method_greedy_l_get_property (GObject * object, guint prop_id, + GValue * value, GParamSpec * pspec) +{ + GstDeinterlaceMethodGreedyL *self = GST_DEINTERLACE_METHOD_GREEDY_L (object); + + switch (prop_id) { + case ARG_MAX_COMB: + g_value_set_uint (value, self->max_comb); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec); + } +} + +static void +gst_deinterlace_method_greedy_l_class_init (GstDeinterlaceMethodGreedyLClass * + klass) +{ + GstDeinterlaceMethodClass *dim_class = (GstDeinterlaceMethodClass *) klass; + GObjectClass *gobject_class = (GObjectClass *) klass; 
+#ifdef BUILD_X86_ASM + guint cpu_flags = oil_cpu_get_flags (); +#endif + + gobject_class->set_property = gst_deinterlace_method_greedy_l_set_property; + gobject_class->get_property = gst_deinterlace_method_greedy_l_get_property; + + g_object_class_install_property (gobject_class, ARG_MAX_COMB, + g_param_spec_uint ("max-comb", + "Max comb", + "Max Comb", 0, 255, 15, G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS) + ); + + dim_class->fields_required = 4; + dim_class->deinterlace_frame = deinterlace_frame_di_greedy; + dim_class->name = "Motion Adaptive: Simple Detection"; + dim_class->nick = "greedyl"; + dim_class->latency = 1; + +#ifdef BUILD_X86_ASM + if (cpu_flags & OIL_IMPL_FLAG_MMXEXT) { + klass->scanline = deinterlace_greedy_packed422_scanline_mmxext; + } else if (cpu_flags & OIL_IMPL_FLAG_MMX) { + klass->scanline = deinterlace_greedy_packed422_scanline_mmx; + } else { + klass->scanline = deinterlace_greedy_packed422_scanline_c; + } +#else + klass->scanline = deinterlace_greedy_packed422_scanline_c; +#endif +} + +static void +gst_deinterlace_method_greedy_l_init (GstDeinterlaceMethodGreedyL * self) +{ + self->max_comb = 15; +} diff --git a/gst/deinterlace/tvtime/greedyh.asm b/gst/deinterlace/tvtime/greedyh.asm new file mode 100644 index 00000000..86e97c58 --- /dev/null +++ b/gst/deinterlace/tvtime/greedyh.asm @@ -0,0 +1,250 @@ +/* + * + * GStreamer + * Copyright (c) 2001 Tom Barry. All rights reserved. + * Copyright (C) 2008 Sebastian Dröge <slomo@collabora.co.uk> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + + +/* + * Relicensed for GStreamer from GPL to LGPL with permit from Tom Barry. 
+ * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578 + */ + + +#include "x86-64_macros.inc" + +void +FUNCT_NAME (GstDeinterlaceMethodGreedyH *self, uint8_t * L1, uint8_t * L2, uint8_t * L3, uint8_t * L2P, + uint8_t * Dest, int size) +{ + + // in tight loop some vars are accessed faster in local storage + int64_t YMask = 0x00ff00ff00ff00ffull; // to keep only luma + int64_t UVMask = 0xff00ff00ff00ff00ull; // to keep only chroma + int64_t ShiftMask = 0xfefefefefefefefeull; // to avoid shifting chroma to luma + int64_t QW256 = 0x0100010001000100ull; // 4 256's + int64_t MaxComb; + int64_t MotionThreshold; + int64_t MotionSense; + int64_t i; + long LoopCtr; + long oldbx; + + int64_t QW256B; + int64_t LastAvg = 0; //interp value from left qword + + // FIXME: Use C implementation if the width is not a multiple of 4 + // Do something more optimal later + if (size % 8 != 0) { + // fall back to the C version for odd widths; without the early return + // the assembly below would process the same line a second time + greedyDScaler_C (self, L1, L2, L3, L2P, Dest, size); + return; + } + + // Set up our two parms that are actually evaluated for each pixel + i = self->max_comb; + MaxComb = + i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i; + + i = self->motion_threshold; // scale to range of 0-257 + MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask; + + i = self->motion_sense; // scale to range of 0-257 + MotionSense = i << 48 | i << 32 | i << 16 | i; + + i = 0xffffffff - 256; + QW256B = i << 48 | i << 32 | i << 16 | i; // save a couple instr on PMINSW instruct. + + LoopCtr = size / 8 - 1; // there are LineLength / 8 qwords per line but do 1 less, adj at end of loop + + // For ease of reading, the comments below assume that we're operating on an odd + // field (i.e., that InfoIsOdd is true). Assume the obvious for even lines. + __asm__ __volatile__ ( + // save ebx (-fPIC) + MOVX " %%" XBX ", %[oldbx]\n\t" + MOVX " %[L1], %%" XAX "\n\t" + LEAX " 8(%%" XAX "), %%" XBX "\n\t" // next qword needed by DJR + MOVX " %[L3], %%" XCX "\n\t" + SUBX " %%" XAX ", %%" XCX "\n\t" // carry L3 addr as an offset + MOVX " %[L2P], %%" XDX "\n\t" + MOVX " %[L2], %%" XSI "\n\t" + MOVX " %[Dest], %%" XDI "\n\t" // DL1 if Odd or DL2 if Even + + ".align 8\n\t" + "1:\n\t" + "movq (%%" XSI "), %%mm0\n\t" // L2 - the newest weave pixel value + "movq (%%" XAX "), %%mm1\n\t" // L1 - the top pixel + "movq (%%" XDX "), %%mm2\n\t" // L2P - the prev weave pixel + "movq (%%" XAX ", %%" XCX "), %%mm3\n\t" // L3, next odd row + "movq %%mm1, %%mm6\n\t" // L1 - get simple single pixel interp + + // pavgb mm6, mm3 // use macro below + V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]") + + // DJR - Diagonal Jaggie Reduction + // In the event that we are going to use an average (Bob) pixel we do not want a jagged + // stair step effect. To combat this we avg in the 2 horizontally adjacent pixels into the + // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels.
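+ // Net effect of the DJR mix below, ignoring pavgb rounding: on the MMX
+ // path the bob pixel ends up as 1/2 centre + 1/4 left neighbour + 1/4
+ // right neighbour; the two extra averages taken on the non-MMX paths
+ // shift the weighting to roughly 3/8 centre, 5/8 adjacent, matching the
+ // comment further down.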
+ + "movq %[LastAvg], %%mm4\n\t" // the bob value from prev qword in row + "movq %%mm6, %[LastAvg]\n\t" // save for next pass + "psrlq $48, %%mm4\n\t" // right justify 1 pixel + "movq %%mm6, %%mm7\n\t" // copy of simple bob pixel + "psllq $16, %%mm7\n\t" // left justify 3 pixels + "por %%mm7, %%mm4\n\t" // and combine + "movq (%%" XBX "), %%mm5\n\t" // next horiz qword from L1 + // pavgb mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below + + V_PAVGB ("%%mm5", "(%%" XBX ",%%" XCX ")", "%%mm7", "%[ShiftMask]") + "psllq $48, %%mm5\n\t" // left just 1 pixel + "movq %%mm6, %%mm7\n\t" // another copy of simple bob pixel + "psrlq $16, %%mm7\n\t" // right just 3 pixels + "por %%mm7, %%mm5\n\t" // combine + // pavgb mm4, mm5 // avg of forward and prev by 1 pixel, use macro + V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]") // mm5 gets modified if MMX + // pavgb mm6, mm4 // avg of center and surround interp vals, use macro + V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]") + + // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors. +#ifndef IS_MMX + // pavgb mm4, mm6 // 1/4 center, 3/4 adjacent + V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]") + // pavgb mm6, mm4 // 3/8 center, 5/8 adjacent + V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]") +#endif + + // get abs value of possible L2 comb + "movq %%mm6, %%mm4\n\t" // work copy of interp val + "movq %%mm2, %%mm7\n\t" // L2 + "psubusb %%mm4, %%mm7\n\t" // L2 - avg + "movq %%mm4, %%mm5\n\t" // avg + "psubusb %%mm2, %%mm5\n\t" // avg - L2 + "por %%mm7, %%mm5\n\t" // abs(avg-L2) + + // get abs value of possible L2P comb + "movq %%mm0, %%mm7\n\t" // L2P + "psubusb %%mm4, %%mm7\n\t" // L2P - avg + "psubusb %%mm0, %%mm4\n\t" // avg - L2P + "por %%mm7, %%mm4\n\t" // abs(avg-L2P) + + // use L2 or L2P depending upon which makes smaller comb + "psubusb %%mm5, %%mm4\n\t" // see if it goes to zero + "psubusb %%mm5, %%mm5\n\t" // 0 + "pcmpeqb %%mm5, %%mm4\n\t" // if (mm4=0) then FF else 0 + "pcmpeqb %%mm4, %%mm5\n\t" // opposite of mm4 + + // if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55 + "pand %%mm2, %%mm5\n\t" // use L2 if mm5 == ff, else 0 + "pand %%mm0, %%mm4\n\t" // use L2P if mm4 = ff, else 0 + "por %%mm5, %%mm4\n\t" // may the best win + + // Inventory: at this point we have the following values: + // mm0 = L2P (or L2) + // mm1 = L1 + // mm2 = L2 (or L2P) + // mm3 = L3 + // mm4 = the best of L2,L2P weave pixel, base upon comb + // mm6 = the avg interpolated value, if we need to use it + // Let's measure movement, as how much the weave pixel has changed + + "movq %%mm2, %%mm7\n\t" + "psubusb %%mm0, %%mm2\n\t" + "psubusb %%mm7, %%mm0\n\t" + "por %%mm2, %%mm0\n\t" // abs value of change, used later + + // Now lets clip our chosen value to be not outside of the range + // of the high/low range L1-L3 by more than MaxComb. + // This allows some comb but limits the damages and also allows more + // detail than a boring oversmoothed clip. 
+ + "movq %%mm1, %%mm2\n\t" // copy L1 + // pmaxub mm2, mm3 // use macro + V_PMAXUB ("%%mm2", "%%mm3") // now = Max(L1,L3) + "movq %%mm1, %%mm5\n\t" // copy L1 + // pminub mm5, mm3 // now = Min(L1,L3), use macro + V_PMINUB ("%%mm5", "%%mm3", "%%mm7") + + // allow the value to be above the high or below the low by amt of MaxComb + "psubusb %[MaxComb], %%mm5\n\t" // lower min by diff + "paddusb %[MaxComb], %%mm2\n\t" // increase max by diff + // pmaxub mm4, mm5 // now = Max(best,Min(L1,L3) use macro + V_PMAXUB ("%%mm4", "%%mm5") + // pminub mm4, mm2 // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped + V_PMINUB ("%%mm4", "%%mm2", "%%mm7") + + // Blend weave pixel with bob pixel, depending on motion val in mm0 + "psubusb %[MotionThreshold], %%mm0\n\t" // test Threshold, clear chroma change >>>?? + "pmullw %[MotionSense], %%mm0\n\t" // mul by user factor, keep low 16 bits + "movq %[QW256], %%mm7\n\t" +#ifdef IS_MMXEXT + "pminsw %%mm7, %%mm0\n\t" // max = 256 +#else + "paddusw %[QW256B], %%mm0\n\t" // add, may sat at fff.. + "psubusw %[QW256B], %%mm0\n\t" // now = Min(L1,256) +#endif + "psubusw %%mm0, %%mm7\n\t" // so the 2 sum to 256, weighted avg + "movq %%mm4, %%mm2\n\t" // save weave chroma info before trashing + "pand %[YMask], %%mm4\n\t" // keep only luma from calc'd value + "pmullw %%mm7, %%mm4\n\t" // use more weave for less motion + "pand %[YMask], %%mm6\n\t" // keep only luma from calc'd value + "pmullw %%mm0, %%mm6\n\t" // use more bob for large motion + "paddusw %%mm6, %%mm4\n\t" // combine + "psrlw $8, %%mm4\n\t" // div by 256 to get weighted avg + // chroma comes from weave pixel + "pand %[UVMask], %%mm2\n\t" // keep chroma + "por %%mm4, %%mm2\n\t" // and combine + V_MOVNTQ ("(%%" XDI ")", "%%mm2") // move in our clipped best, use macro + // bump ptrs and loop + LEAX " 8(%%" XAX "), %%" XAX "\n\t" + LEAX " 8(%%" XBX "), %%" XBX "\n\t" + LEAX " 8(%%" XDX "), %%" XDX "\n\t" + LEAX " 8(%%" XDI "), %%" XDI "\n\t" + LEAX " 8(%%" XSI "), %%" XSI "\n\t" + DECX " %[LoopCtr]\n\t" + + "jg 1b\n\t" // loop if not to last line + // note P-III default assumes backward branches taken + "jl 1f\n\t" // done + MOVX " %%" XAX ", %%" XBX "\n\t" // sharpness lookahead 1 byte only, be wrong on 1 + "jmp 1b\n\t" + + "1:\n\t" + MOVX " %[oldbx], %%" XBX "\n\t" + "emms\n\t": /* no outputs */ + + :[LastAvg] "m" (LastAvg), + [L1] "m" (L1), + [L3] "m" (L3), + [L2P] "m" (L2P), + [L2] "m" (L2), + [Dest] "m" (Dest), + [ShiftMask] "m" (ShiftMask), + [MaxComb] "m" (MaxComb), + [MotionThreshold] "m" (MotionThreshold), + [MotionSense] "m" (MotionSense), + [QW256B] "m" (QW256B), + [YMask] "m" (YMask), + [UVMask] "m" (UVMask), + [LoopCtr] "m" (LoopCtr), + [QW256] "m" (QW256), + [oldbx] "m" (oldbx) + : XAX, XCX, XDX, XSI, XDI, + "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)", +#ifdef __MMX__ + "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", +#endif + "memory", "cc"); +} diff --git a/gst/deinterlace/tvtime/greedyh.c b/gst/deinterlace/tvtime/greedyh.c new file mode 100644 index 00000000..5d050ce0 --- /dev/null +++ b/gst/deinterlace/tvtime/greedyh.c @@ -0,0 +1,420 @@ +/* + * + * GStreamer + * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net> + * Copyright (C) 2008 Sebastian Dröge <slomo@collabora.co.uk> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Relicensed for GStreamer from GPL to LGPL with permit from Billy Biggs. + * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578 + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "greedyhmacros.h" + +#include <stdlib.h> +#include "_stdint.h" +#include <string.h> + +#include "gst/gst.h" +#include "plugins.h" +#include "gstdeinterlace.h" + +#define GST_TYPE_DEINTERLACE_METHOD_GREEDY_H (gst_deinterlace_method_greedy_h_get_type ()) +#define GST_IS_DEINTERLACE_METHOD_GREEDY_H(obj) (G_TYPE_CHECK_INSTANCE_TYPE ((obj), GST_TYPE_DEINTERLACE_METHOD_GREEDY_H)) +#define GST_IS_DEINTERLACE_METHOD_GREEDY_H_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), GST_TYPE_DEINTERLACE_METHOD_GREEDY_H)) +#define GST_DEINTERLACE_METHOD_GREEDY_H_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS ((obj), GST_TYPE_DEINTERLACE_METHOD_GREEDY_H, GstDeinterlaceMethodGreedyHClass)) +#define GST_DEINTERLACE_METHOD_GREEDY_H(obj) (G_TYPE_CHECK_INSTANCE_CAST ((obj), GST_TYPE_DEINTERLACE_METHOD_GREEDY_H, GstDeinterlaceMethodGreedyH)) +#define GST_DEINTERLACE_METHOD_GREEDY_H_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST ((klass), GST_TYPE_DEINTERLACE_METHOD_GREEDY_H, GstDeinterlaceMethodGreedyHClass)) +#define GST_DEINTERLACE_METHOD_GREEDY_H_CAST(obj) ((GstDeinterlaceMethodGreedyH*)(obj)) + +GType gst_deinterlace_method_greedy_h_get_type (void); + +typedef struct +{ + GstDeinterlaceMethod parent; + + guint max_comb, motion_threshold, motion_sense; +} GstDeinterlaceMethodGreedyH; + +typedef struct +{ + GstDeinterlaceMethodClass parent_class; + void (*scanline) (GstDeinterlaceMethodGreedyH * self, uint8_t * L2, + uint8_t * L1, uint8_t * L3, uint8_t * L2P, uint8_t * Dest, int size); +} GstDeinterlaceMethodGreedyHClass; + +void +greedyDScaler_C (GstDeinterlaceMethodGreedyH * self, uint8_t * L1, uint8_t * L2, + uint8_t * L3, uint8_t * L2P, uint8_t * Dest, int size) +{ + int Pos; + uint8_t l1_l, l1_1_l, l3_l, l3_1_l; + uint8_t l1_c, l1_1_c, l3_c, l3_1_c; + uint8_t avg_l, avg_c, avg_l_1, avg_c_1; + uint8_t avg_l__1 = 0, avg_c__1 = 0; + uint8_t avg_s_l, avg_s_c; + uint8_t avg_sc_l, avg_sc_c; + uint8_t best_l, best_c; + uint16_t mov_l; + uint8_t out_l, out_c; + uint8_t l2_l, l2_c, lp2_l, lp2_c; + uint8_t l2_l_diff, l2_c_diff, lp2_l_diff, lp2_c_diff; + uint8_t min_l, min_c, max_l, max_c; + guint max_comb = self->max_comb; + guint motion_sense = self->motion_sense; + guint motion_threshold = self->motion_threshold; + + for (Pos = 0; Pos < size; Pos += 2) { + l1_l = L1[0]; + l1_c = L1[1]; + l3_l = L3[0]; + l3_c = L3[1]; + + if (Pos == size - 1) { + l1_1_l = l1_l; + l1_1_c = l1_c; + l3_1_l = l3_l; + l3_1_c = l3_c; + } else { + l1_1_l = L1[2]; + l1_1_c = L1[3]; + l3_1_l = L3[2]; + l3_1_c = L3[3]; + } + + /* Average of L1 and L3 */ + avg_l = (l1_l + l3_l) / 2; + avg_c = (l1_c + l3_c) / 2; + + if (Pos == 0) { + avg_l__1 = avg_l; + avg_c__1 = avg_c; + } + + /* Average of next L1 and next L3 */ + avg_l_1 = (l1_1_l + l3_1_l) / 2; + avg_c_1 = (l1_1_c + l3_1_c) / 2; + + /* Calculate average of one pixel forward and previous */ + avg_s_l = (avg_l__1 + 
avg_l_1) / 2; + avg_s_c = (avg_c__1 + avg_c_1) / 2; + + /* Calculate average of center and surrounding pixels */ + avg_sc_l = (avg_l + avg_s_l) / 2; + avg_sc_c = (avg_c + avg_s_c) / 2; + + /* move forward */ + avg_l__1 = avg_l; + avg_c__1 = avg_c; + + /* Get best L2/L2P, i.e. least diff from above average */ + l2_l = L2[0]; + l2_c = L2[1]; + lp2_l = L2P[0]; + lp2_c = L2P[1]; + + l2_l_diff = ABS (l2_l - avg_sc_l); + l2_c_diff = ABS (l2_c - avg_sc_c); + + lp2_l_diff = ABS (lp2_l - avg_sc_l); + lp2_c_diff = ABS (lp2_c - avg_sc_c); + + if (l2_l_diff > lp2_l_diff) + best_l = lp2_l; + else + best_l = l2_l; + + if (l2_c_diff > lp2_c_diff) + best_c = lp2_c; + else + best_c = l2_c; + + /* Clip this best L2/L2P by L1/L3 and allow to differ by GreedyMaxComb */ + max_l = MAX (l1_l, l3_l); + min_l = MIN (l1_l, l3_l); + + if (max_l < 256 - max_comb) + max_l += max_comb; + else + max_l = 255; + + if (min_l > max_comb) + min_l -= max_comb; + else + min_l = 0; + + max_c = MAX (l1_c, l3_c); + min_c = MIN (l1_c, l3_c); + + if (max_c < 256 - max_comb) + max_c += max_comb; + else + max_c = 255; + + if (min_c > max_comb) + min_c -= max_comb; + else + min_c = 0; + + out_l = CLAMP (best_l, min_l, max_l); + out_c = CLAMP (best_c, min_c, max_c); + + /* Do motion compensation for luma, i.e. how much + * the weave pixel differs */ + mov_l = ABS (l2_l - lp2_l); + if (mov_l > motion_threshold) + mov_l -= motion_threshold; + else + mov_l = 0; + + mov_l = mov_l * motion_sense; + if (mov_l > 256) + mov_l = 256; + + /* Weighted sum on clipped weave pixel and average */ + out_l = (out_l * (256 - mov_l) + avg_sc_l * mov_l) / 256; + + Dest[0] = out_l; + Dest[1] = out_c; + + Dest += 2; + L1 += 2; + L2 += 2; + L3 += 2; + L2P += 2; + } +} + +#ifdef BUILD_X86_ASM + +#define IS_MMXEXT +#define SIMD_TYPE MMXEXT +#define FUNCT_NAME greedyDScaler_MMXEXT +#include "greedyh.asm" +#undef SIMD_TYPE +#undef IS_MMXEXT +#undef FUNCT_NAME + +#define IS_3DNOW +#define SIMD_TYPE 3DNOW +#define FUNCT_NAME greedyDScaler_3DNOW +#include "greedyh.asm" +#undef SIMD_TYPE +#undef IS_3DNOW +#undef FUNCT_NAME + +#define IS_MMX +#define SIMD_TYPE MMX +#define FUNCT_NAME greedyDScaler_MMX +#include "greedyh.asm" +#undef SIMD_TYPE +#undef IS_MMX +#undef FUNCT_NAME + +#endif + +static void +deinterlace_frame_di_greedyh (GstDeinterlaceMethod * d_method, + GstDeinterlace * object, GstBuffer * outbuf) +{ + GstDeinterlaceMethodGreedyH *self = + GST_DEINTERLACE_METHOD_GREEDY_H (d_method); + GstDeinterlaceMethodGreedyHClass *klass = + GST_DEINTERLACE_METHOD_GREEDY_H_GET_CLASS (self); + int InfoIsOdd = 0; + int Line; + unsigned int Pitch = object->field_stride; + + unsigned char *L1; // ptr to Line1, of 3 + unsigned char *L2; // ptr to Line2, the weave line + unsigned char *L3; // ptr to Line3 + + unsigned char *L2P; // ptr to prev Line2 + unsigned char *Dest = GST_BUFFER_DATA (outbuf); + + // copy first even line no matter what, and the first odd line if we're + // processing an EVEN field. (note diff from other deint rtns.) 
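+ // Field-history layout used below: L2 is the newest field
+ // (history_count - 1), L1 and L3 are consecutive lines of the
+ // opposite-parity field before it (history_count - 2), and L2P is the
+ // previous field of the same parity as L2 (history_count - 3).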
+ + if (object->field_history[object->history_count - 1].flags == + PICTURE_INTERLACED_BOTTOM) { + InfoIsOdd = 1; + + L1 = GST_BUFFER_DATA (object->field_history[object->history_count - 2].buf); + L2 = GST_BUFFER_DATA (object->field_history[object->history_count - 1].buf); + L3 = L1 + Pitch; + L2P = + GST_BUFFER_DATA (object->field_history[object->history_count - 3].buf); + + // copy first even line + oil_memcpy (Dest, L1, object->row_stride); + Dest += object->row_stride; + } else { + InfoIsOdd = 0; + L1 = GST_BUFFER_DATA (object->field_history[object->history_count - 2].buf); + L2 = GST_BUFFER_DATA (object->field_history[object->history_count - + 1].buf) + Pitch; + L3 = L1 + Pitch; + L2P = + GST_BUFFER_DATA (object->field_history[object->history_count - 3].buf) + + Pitch; + + // copy first even line + oil_memcpy (Dest, GST_BUFFER_DATA (object->field_history[0].buf), + object->row_stride); + Dest += object->row_stride; + // then first odd line + oil_memcpy (Dest, L1, object->row_stride); + Dest += object->row_stride; + } + + for (Line = 0; Line < (object->field_height - 1); ++Line) { + klass->scanline (self, L1, L2, L3, L2P, Dest, object->row_stride); + Dest += object->row_stride; + oil_memcpy (Dest, L3, object->row_stride); + Dest += object->row_stride; + + L1 += Pitch; + L2 += Pitch; + L3 += Pitch; + L2P += Pitch; + } + + if (InfoIsOdd) { + oil_memcpy (Dest, L2, object->row_stride); + } +} + +G_DEFINE_TYPE (GstDeinterlaceMethodGreedyH, gst_deinterlace_method_greedy_h, + GST_TYPE_DEINTERLACE_METHOD); + +enum +{ + ARG_0, + ARG_MAX_COMB, + ARG_MOTION_THRESHOLD, + ARG_MOTION_SENSE +}; + +static void +gst_deinterlace_method_greedy_h_set_property (GObject * object, guint prop_id, + const GValue * value, GParamSpec * pspec) +{ + GstDeinterlaceMethodGreedyH *self = GST_DEINTERLACE_METHOD_GREEDY_H (object); + + switch (prop_id) { + case ARG_MAX_COMB: + self->max_comb = g_value_get_uint (value); + break; + case ARG_MOTION_THRESHOLD: + self->motion_threshold = g_value_get_uint (value); + break; + case ARG_MOTION_SENSE: + self->motion_sense = g_value_get_uint (value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec); + } +} + +static void +gst_deinterlace_method_greedy_h_get_property (GObject * object, guint prop_id, + GValue * value, GParamSpec * pspec) +{ + GstDeinterlaceMethodGreedyH *self = GST_DEINTERLACE_METHOD_GREEDY_H (object); + + switch (prop_id) { + case ARG_MAX_COMB: + g_value_set_uint (value, self->max_comb); + break; + case ARG_MOTION_THRESHOLD: + g_value_set_uint (value, self->motion_threshold); + break; + case ARG_MOTION_SENSE: + g_value_set_uint (value, self->motion_sense); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec); + } +} + +static void +gst_deinterlace_method_greedy_h_class_init (GstDeinterlaceMethodGreedyHClass * + klass) +{ + GstDeinterlaceMethodClass *dim_class = (GstDeinterlaceMethodClass *) klass; + GObjectClass *gobject_class = (GObjectClass *) klass; +#ifdef BUILD_X86_ASM + guint cpu_flags = oil_cpu_get_flags (); +#endif + + gobject_class->set_property = gst_deinterlace_method_greedy_h_set_property; + gobject_class->get_property = gst_deinterlace_method_greedy_h_get_property; + + g_object_class_install_property (gobject_class, ARG_MAX_COMB, + g_param_spec_uint ("max-comb", + "Max comb", + "Max Comb", 0, 255, 5, G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS) + ); + + g_object_class_install_property (gobject_class, ARG_MOTION_THRESHOLD, + g_param_spec_uint ("motion-threshold", + "Motion Threshold", + "Motion 
Threshold", + 0, 255, 25, G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS) + ); + + g_object_class_install_property (gobject_class, ARG_MOTION_SENSE, + g_param_spec_uint ("motion-sense", + "Motion Sense", + "Motion Sense", + 0, 255, 30, G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS) + ); + + dim_class->fields_required = 4; + dim_class->deinterlace_frame = deinterlace_frame_di_greedyh; + dim_class->name = "Motion Adaptive: Advanced Detection"; + dim_class->nick = "greedyh"; + dim_class->latency = 1; + +#ifdef BUILD_X86_ASM + if (cpu_flags & OIL_IMPL_FLAG_MMXEXT) { + klass->scanline = greedyDScaler_MMXEXT; + } else if (cpu_flags & OIL_IMPL_FLAG_3DNOW) { + klass->scanline = greedyDScaler_3DNOW; + } else if (cpu_flags & OIL_IMPL_FLAG_MMX) { + klass->scanline = greedyDScaler_MMX; + } else { + klass->scanline = greedyDScaler_C; + } +#else + klass->scanline = greedyDScaler_C; +#endif +} + +static void +gst_deinterlace_method_greedy_h_init (GstDeinterlaceMethodGreedyH * self) +{ + self->max_comb = 5; + self->motion_threshold = 25; + self->motion_sense = 30; +} diff --git a/gst/deinterlace/tvtime/greedyhmacros.h b/gst/deinterlace/tvtime/greedyhmacros.h new file mode 100644 index 00000000..0386c28e --- /dev/null +++ b/gst/deinterlace/tvtime/greedyhmacros.h @@ -0,0 +1,75 @@ +///////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2001 Tom Barry. All rights reserved. +///////////////////////////////////////////////////////////////////////////// +// +// This file is subject to the terms of the GNU General Public License as +// published by the Free Software Foundation. A copy of this license is +// included with this software distribution in the file COPYING. If you +// do not have a copy, you may obtain a copy by writing to the Free +// Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. +// +// This software is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details +// +///////////////////////////////////////////////////////////////////////////// + +// Define a few macros for CPU dependent instructions. +// I suspect I don't really understand how the C macro preprocessor works but +// this seems to get the job done. 
// TRB 7/01 + +// BEFORE USING THESE YOU MUST SET: + +// #define SIMD_TYPE MMXEXT (or MMX or 3DNOW) + +// some macros for pavgb instruction +// V_PAVGB(mmr1, mmr2, mmr work register, smask) mmr2 may = mmrw if you can trash it + +#define V_PAVGB_MMX(mmr1, mmr2, mmrw, smask) \ + "movq "mmr2", "mmrw"\n\t" \ + "pand "smask", "mmrw"\n\t" \ + "psrlw $1, "mmrw"\n\t" \ + "pand "smask", "mmr1"\n\t" \ + "psrlw $1, "mmr1"\n\t" \ + "paddusb "mmrw", "mmr1"\n\t" +#define V_PAVGB_MMXEXT(mmr1, mmr2, mmrw, smask) "pavgb "mmr2", "mmr1"\n\t" +#define V_PAVGB_3DNOW(mmr1, mmr2, mmrw, smask) "pavgusb "mmr2", "mmr1"\n\t" +#define V_PAVGB(mmr1, mmr2, mmrw, smask) V_PAVGB2(mmr1, mmr2, mmrw, smask, SIMD_TYPE) +#define V_PAVGB2(mmr1, mmr2, mmrw, smask, simd_type) V_PAVGB3(mmr1, mmr2, mmrw, smask, simd_type) +#define V_PAVGB3(mmr1, mmr2, mmrw, smask, simd_type) V_PAVGB_##simd_type(mmr1, mmr2, mmrw, smask) + +// some macros for pmaxub instruction +#define V_PMAXUB_MMX(mmr1, mmr2) \ + "psubusb "mmr2", "mmr1"\n\t" \ + "paddusb "mmr2", "mmr1"\n\t" +#define V_PMAXUB_MMXEXT(mmr1, mmr2) "pmaxub "mmr2", "mmr1"\n\t" +#define V_PMAXUB_3DNOW(mmr1, mmr2) V_PMAXUB_MMX(mmr1, mmr2) // use MMX version +#define V_PMAXUB(mmr1, mmr2) V_PMAXUB2(mmr1, mmr2, SIMD_TYPE) +#define V_PMAXUB2(mmr1, mmr2, simd_type) V_PMAXUB3(mmr1, mmr2, simd_type) +#define V_PMAXUB3(mmr1, mmr2, simd_type) V_PMAXUB_##simd_type(mmr1, mmr2) + +// some macros for pminub instruction +// V_PMINUB(mmr1, mmr2, mmr work register) mmr2 may NOT = mmrw +#define V_PMINUB_MMX(mmr1, mmr2, mmrw) \ + "pcmpeqb "mmrw", "mmrw"\n\t" \ + "psubusb "mmr2", "mmrw"\n\t" \ + "paddusb "mmrw", "mmr1"\n\t" \ + "psubusb "mmrw", "mmr1"\n\t" +#define V_PMINUB_MMXEXT(mmr1, mmr2, mmrw) "pminub "mmr2", "mmr1"\n\t" +#define V_PMINUB_3DNOW(mmr1, mmr2, mmrw) V_PMINUB_MMX(mmr1, mmr2, mmrw) // use MMX version +#define V_PMINUB(mmr1, mmr2, mmrw) V_PMINUB2(mmr1, mmr2, mmrw, SIMD_TYPE) +#define V_PMINUB2(mmr1, mmr2, mmrw, simd_type) V_PMINUB3(mmr1, mmr2, mmrw, simd_type) +#define V_PMINUB3(mmr1, mmr2, mmrw, simd_type) V_PMINUB_##simd_type(mmr1, mmr2, mmrw) + +// some macros for movntq instruction +// V_MOVNTQ(mmr1, mmr2) +#define V_MOVNTQ_MMX(mmr1, mmr2) "movq "mmr2", "mmr1"\n\t" +#define V_MOVNTQ_3DNOW(mmr1, mmr2) "movq "mmr2", "mmr1"\n\t" +#define V_MOVNTQ_MMXEXT(mmr1, mmr2) "movntq "mmr2", "mmr1"\n\t" +#define V_MOVNTQ(mmr1, mmr2) V_MOVNTQ2(mmr1, mmr2, SIMD_TYPE) +#define V_MOVNTQ2(mmr1, mmr2, simd_type) V_MOVNTQ3(mmr1, mmr2, simd_type) +#define V_MOVNTQ3(mmr1, mmr2, simd_type) V_MOVNTQ_##simd_type(mmr1, mmr2) + +// end of macros + diff --git a/gst/deinterlace/tvtime/linear.c b/gst/deinterlace/tvtime/linear.c new file mode 100644 index 00000000..8a13d8a1 --- /dev/null +++ b/gst/deinterlace/tvtime/linear.c @@ -0,0 +1,214 @@ +/** + * Copyright (C) 2002 Billy Biggs <vektor@dumbterm.net>. + * Copyright (C) 2008 Sebastian Dröge <slomo@collabora.co.uk> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "_stdint.h" +#include "gstdeinterlace.h" +#include <string.h> + +#define GST_TYPE_DEINTERLACE_METHOD_LINEAR (gst_deinterlace_method_linear_get_type ()) +#define GST_IS_DEINTERLACE_METHOD_LINEAR(obj) (G_TYPE_CHECK_INSTANCE_TYPE ((obj), GST_TYPE_DEINTERLACE_METHOD_LINEAR)) +#define GST_IS_DEINTERLACE_METHOD_LINEAR_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), GST_TYPE_DEINTERLACE_METHOD_LINEAR)) +#define GST_DEINTERLACE_METHOD_LINEAR_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS ((obj), GST_TYPE_DEINTERLACE_METHOD_LINEAR, GstDeinterlaceMethodLinearClass)) +#define GST_DEINTERLACE_METHOD_LINEAR(obj) (G_TYPE_CHECK_INSTANCE_CAST ((obj), GST_TYPE_DEINTERLACE_METHOD_LINEAR, GstDeinterlaceMethodLinear)) +#define GST_DEINTERLACE_METHOD_LINEAR_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST ((klass), GST_TYPE_DEINTERLACE_METHOD_LINEAR, GstDeinterlaceMethodLinearClass)) +#define GST_DEINTERLACE_METHOD_LINEAR_CAST(obj) ((GstDeinterlaceMethodLinear*)(obj)) + +GType gst_deinterlace_method_linear_get_type (void); + +typedef GstDeinterlaceSimpleMethod GstDeinterlaceMethodLinear; + +typedef GstDeinterlaceSimpleMethodClass GstDeinterlaceMethodLinearClass; + +static void +deinterlace_scanline_linear_c (GstDeinterlaceMethod * self, + GstDeinterlace * parent, guint8 * out, + GstDeinterlaceScanlineData * scanlines, gint width) +{ + gint i; + + width *= 2; + for (i = 0; i < width; i++) + out[i] = (scanlines->t0[i] + scanlines->b0[i]) / 2; +} + +#ifdef BUILD_X86_ASM +#include "mmx.h" +static void +deinterlace_scanline_linear_mmx (GstDeinterlaceMethod * self, + GstDeinterlace * parent, guint8 * out, + GstDeinterlaceScanlineData * scanlines, gint width) +{ + const mmx_t shiftmask = { 0xfefffefffefffeffULL }; /* To avoid shifting chroma to luma. */ + int i; + guint8 *bot = scanlines->b0, *top = scanlines->t0; + + for (i = width / 16; i; --i) { + movq_m2r (*bot, mm0); + movq_m2r (*top, mm1); + movq_m2r (*(bot + 8), mm2); + movq_m2r (*(top + 8), mm3); + movq_m2r (*(bot + 16), mm4); + movq_m2r (*(top + 16), mm5); + movq_m2r (*(bot + 24), mm6); + movq_m2r (*(top + 24), mm7); + pand_m2r (shiftmask, mm0); + pand_m2r (shiftmask, mm1); + pand_m2r (shiftmask, mm2); + pand_m2r (shiftmask, mm3); + pand_m2r (shiftmask, mm4); + pand_m2r (shiftmask, mm5); + pand_m2r (shiftmask, mm6); + pand_m2r (shiftmask, mm7); + psrlw_i2r (1, mm0); + psrlw_i2r (1, mm1); + psrlw_i2r (1, mm2); + psrlw_i2r (1, mm3); + psrlw_i2r (1, mm4); + psrlw_i2r (1, mm5); + psrlw_i2r (1, mm6); + psrlw_i2r (1, mm7); + paddb_r2r (mm1, mm0); + paddb_r2r (mm3, mm2); + paddb_r2r (mm5, mm4); + paddb_r2r (mm7, mm6); + movq_r2m (mm0, *out); + movq_r2m (mm2, *(out + 8)); + movq_r2m (mm4, *(out + 16)); + movq_r2m (mm6, *(out + 24)); + out += 32; + top += 32; + bot += 32; + } + width = (width & 0xf); + + for (i = width / 4; i; --i) { + movq_m2r (*bot, mm0); + movq_m2r (*top, mm1); + pand_m2r (shiftmask, mm0); + pand_m2r (shiftmask, mm1); + psrlw_i2r (1, mm0); + psrlw_i2r (1, mm1); + paddb_r2r (mm1, mm0); + movq_r2m (mm0, *out); + out += 8; + top += 8; + bot += 8; + } + width = width & 0x7; + + /* Handle last few pixels. 
*/ + for (i = width * 2; i; --i) { + *out++ = ((*top++) + (*bot++)) >> 1; + } + + emms (); +} + +#include "sse.h" +static void +deinterlace_scanline_linear_mmxext (GstDeinterlaceMethod * self, + GstDeinterlace * parent, guint8 * out, + GstDeinterlaceScanlineData * scanlines, gint width) +{ + gint i; + guint8 *bot = scanlines->b0, *top = scanlines->t0; + + for (i = width / 16; i; --i) { + movq_m2r (*bot, mm0); + movq_m2r (*top, mm1); + movq_m2r (*(bot + 8), mm2); + movq_m2r (*(top + 8), mm3); + movq_m2r (*(bot + 16), mm4); + movq_m2r (*(top + 16), mm5); + movq_m2r (*(bot + 24), mm6); + movq_m2r (*(top + 24), mm7); + pavgb_r2r (mm1, mm0); + pavgb_r2r (mm3, mm2); + pavgb_r2r (mm5, mm4); + pavgb_r2r (mm7, mm6); + movntq_r2m (mm0, *out); + movntq_r2m (mm2, *(out + 8)); + movntq_r2m (mm4, *(out + 16)); + movntq_r2m (mm6, *(out + 24)); + out += 32; + top += 32; + bot += 32; + } + width = (width & 0xf); + + for (i = width / 4; i; --i) { + movq_m2r (*bot, mm0); + movq_m2r (*top, mm1); + pavgb_r2r (mm1, mm0); + movntq_r2m (mm0, *out); + out += 8; + top += 8; + bot += 8; + } + width = width & 0x7; + + /* Handle last few pixels. */ + for (i = width * 2; i; --i) { + *out++ = ((*top++) + (*bot++)) >> 1; + } + + emms (); +} + +#endif + +G_DEFINE_TYPE (GstDeinterlaceMethodLinear, gst_deinterlace_method_linear, + GST_TYPE_DEINTERLACE_SIMPLE_METHOD); + +static void +gst_deinterlace_method_linear_class_init (GstDeinterlaceMethodLinearClass * + klass) +{ + GstDeinterlaceMethodClass *dim_class = (GstDeinterlaceMethodClass *) klass; + GstDeinterlaceSimpleMethodClass *dism_class = + (GstDeinterlaceSimpleMethodClass *) klass; +#ifdef BUILD_X86_ASM + guint cpu_flags = oil_cpu_get_flags (); +#endif + + dim_class->fields_required = 1; + dim_class->name = "Television: Full resolution"; + dim_class->nick = "linear"; + dim_class->latency = 0; + + dism_class->interpolate_scanline = deinterlace_scanline_linear_c; + +#ifdef BUILD_X86_ASM + if (cpu_flags & OIL_IMPL_FLAG_MMXEXT) { + dism_class->interpolate_scanline = deinterlace_scanline_linear_mmxext; + } else if (cpu_flags & OIL_IMPL_FLAG_MMX) { + dism_class->interpolate_scanline = deinterlace_scanline_linear_mmx; + } +#endif +} + +static void +gst_deinterlace_method_linear_init (GstDeinterlaceMethodLinear * self) +{ +} diff --git a/gst/deinterlace/tvtime/linearblend.c b/gst/deinterlace/tvtime/linearblend.c new file mode 100644 index 00000000..5ecffd6e --- /dev/null +++ b/gst/deinterlace/tvtime/linearblend.c @@ -0,0 +1,231 @@ +/** + * Linear blend deinterlacing plugin. The idea for this algorithm came + * from the linear blend deinterlacer which originated in the mplayer + * sources. + * + * Copyright (C) 2002 Billy Biggs <vektor@dumbterm.net>. + * Copyright (C) 2008 Sebastian Dröge <slomo@collabora.co.uk> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA.
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "_stdint.h" +#include "gstdeinterlace.h" +#include <string.h> + +#define GST_TYPE_DEINTERLACE_METHOD_LINEAR_BLEND (gst_deinterlace_method_linear_blend_get_type ()) +#define GST_IS_DEINTERLACE_METHOD_LINEAR_BLEND(obj) (G_TYPE_CHECK_INSTANCE_TYPE ((obj), GST_TYPE_DEINTERLACE_METHOD_LINEAR_BLEND)) +#define GST_IS_DEINTERLACE_METHOD_LINEAR_BLEND_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), GST_TYPE_DEINTERLACE_METHOD_LINEAR_BLEND)) +#define GST_DEINTERLACE_METHOD_LINEAR_BLEND_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS ((obj), GST_TYPE_DEINTERLACE_METHOD_LINEAR_BLEND, GstDeinterlaceMethodLinearBlendClass)) +#define GST_DEINTERLACE_METHOD_LINEAR_BLEND(obj) (G_TYPE_CHECK_INSTANCE_CAST ((obj), GST_TYPE_DEINTERLACE_METHOD_LINEAR_BLEND, GstDeinterlaceMethodLinearBlend)) +#define GST_DEINTERLACE_METHOD_LINEAR_BLEND_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST ((klass), GST_TYPE_DEINTERLACE_METHOD_LINEAR_BLEND, GstDeinterlaceMethodLinearBlendClass)) +#define GST_DEINTERLACE_METHOD_LINEAR_BLEND_CAST(obj) ((GstDeinterlaceMethodLinearBlend*)(obj)) + +GType gst_deinterlace_method_linear_blend_get_type (void); + +typedef GstDeinterlaceSimpleMethod GstDeinterlaceMethodLinearBlend; + +typedef GstDeinterlaceSimpleMethodClass GstDeinterlaceMethodLinearBlendClass; + + +static inline void +deinterlace_scanline_linear_blend_c (GstDeinterlaceMethod * self, + GstDeinterlace * parent, guint8 * out, + GstDeinterlaceScanlineData * scanlines, gint width) +{ + guint8 *t0 = scanlines->t0; + guint8 *b0 = scanlines->b0; + guint8 *m1 = scanlines->m1; + + width *= 2; + + while (width--) { + *out++ = (*t0++ + *b0++ + (*m1++ << 1)) >> 2; + } +} + +static inline void +deinterlace_scanline_linear_blend2_c (GstDeinterlaceMethod * self, + GstDeinterlace * parent, guint8 * out, + GstDeinterlaceScanlineData * scanlines, gint width) +{ + guint8 *m0 = scanlines->m0; + guint8 *t1 = scanlines->t1; + guint8 *b1 = scanlines->b1; + + width *= 2; + while (width--) { + *out++ = (*t1++ + *b1++ + (*m0++ << 1)) >> 2; + } +} + +#ifdef BUILD_X86_ASM +#include "mmx.h" +static inline void +deinterlace_scanline_linear_blend_mmx (GstDeinterlaceMethod * self, + GstDeinterlace * parent, guint8 * out, + GstDeinterlaceScanlineData * scanlines, gint width) +{ + guint8 *t0 = scanlines->t0; + guint8 *b0 = scanlines->b0; + guint8 *m1 = scanlines->m1; + gint i; + + // Get width in bytes. 
+ width *= 2; + i = width / 8; + width -= i * 8; + + pxor_r2r (mm7, mm7); + while (i--) { + movd_m2r (*t0, mm0); + movd_m2r (*b0, mm1); + movd_m2r (*m1, mm2); + + movd_m2r (*(t0 + 4), mm3); + movd_m2r (*(b0 + 4), mm4); + movd_m2r (*(m1 + 4), mm5); + + punpcklbw_r2r (mm7, mm0); + punpcklbw_r2r (mm7, mm1); + punpcklbw_r2r (mm7, mm2); + + punpcklbw_r2r (mm7, mm3); + punpcklbw_r2r (mm7, mm4); + punpcklbw_r2r (mm7, mm5); + + psllw_i2r (1, mm2); + psllw_i2r (1, mm5); + paddw_r2r (mm0, mm2); + paddw_r2r (mm3, mm5); + paddw_r2r (mm1, mm2); + paddw_r2r (mm4, mm5); + psrlw_i2r (2, mm2); + psrlw_i2r (2, mm5); + packuswb_r2r (mm2, mm2); + packuswb_r2r (mm5, mm5); + + movd_r2m (mm2, *out); + movd_r2m (mm5, *(out + 4)); + out += 8; + t0 += 8; + b0 += 8; + m1 += 8; + } + while (width--) { + *out++ = (*t0++ + *b0++ + (*m1++ << 1)) >> 2; + } + emms (); +} + +static inline void +deinterlace_scanline_linear_blend2_mmx (GstDeinterlaceMethod * self, + GstDeinterlace * parent, guint8 * out, + GstDeinterlaceScanlineData * scanlines, gint width) +{ + guint8 *m0 = scanlines->m0; + guint8 *t1 = scanlines->t1; + guint8 *b1 = scanlines->b1; + gint i; + + // Get width in bytes. + width *= 2; + i = width / 8; + width -= i * 8; + + pxor_r2r (mm7, mm7); + while (i--) { + movd_m2r (*t1, mm0); + movd_m2r (*b1, mm1); + movd_m2r (*m0, mm2); + + movd_m2r (*(t1 + 4), mm3); + movd_m2r (*(b1 + 4), mm4); + movd_m2r (*(m0 + 4), mm5); + + punpcklbw_r2r (mm7, mm0); + punpcklbw_r2r (mm7, mm1); + punpcklbw_r2r (mm7, mm2); + + punpcklbw_r2r (mm7, mm3); + punpcklbw_r2r (mm7, mm4); + punpcklbw_r2r (mm7, mm5); + + psllw_i2r (1, mm2); + psllw_i2r (1, mm5); + paddw_r2r (mm0, mm2); + paddw_r2r (mm3, mm5); + paddw_r2r (mm1, mm2); + paddw_r2r (mm4, mm5); + psrlw_i2r (2, mm2); + psrlw_i2r (2, mm5); + packuswb_r2r (mm2, mm2); + packuswb_r2r (mm5, mm5); + + movd_r2m (mm2, *out); + movd_r2m (mm5, *(out + 4)); + out += 8; + t1 += 8; + b1 += 8; + m0 += 8; + } + while (width--) { + *out++ = (*t1++ + *b1++ + (*m0++ << 1)) >> 2; + } + emms (); +} + +#endif + +G_DEFINE_TYPE (GstDeinterlaceMethodLinearBlend, + gst_deinterlace_method_linear_blend, GST_TYPE_DEINTERLACE_SIMPLE_METHOD); + +static void + gst_deinterlace_method_linear_blend_class_init + (GstDeinterlaceMethodLinearBlendClass * klass) +{ + GstDeinterlaceMethodClass *dim_class = (GstDeinterlaceMethodClass *) klass; + GstDeinterlaceSimpleMethodClass *dism_class = + (GstDeinterlaceSimpleMethodClass *) klass; +#ifdef BUILD_X86_ASM + guint cpu_flags = oil_cpu_get_flags (); +#endif + + dim_class->fields_required = 2; + dim_class->name = "Blur: Temporal"; + dim_class->nick = "linearblend"; + dim_class->latency = 0; + + dism_class->interpolate_scanline = deinterlace_scanline_linear_blend_c; + dism_class->copy_scanline = deinterlace_scanline_linear_blend2_c; + +#ifdef BUILD_X86_ASM + if (cpu_flags & OIL_IMPL_FLAG_MMX) { + dism_class->interpolate_scanline = deinterlace_scanline_linear_blend_mmx; + dism_class->copy_scanline = deinterlace_scanline_linear_blend2_mmx; + } +#endif +} + +static void +gst_deinterlace_method_linear_blend_init (GstDeinterlaceMethodLinearBlend * + self) +{ +} diff --git a/gst/deinterlace/tvtime/mmx.h b/gst/deinterlace/tvtime/mmx.h new file mode 100644 index 00000000..3627e61b --- /dev/null +++ b/gst/deinterlace/tvtime/mmx.h @@ -0,0 +1,723 @@ +/* mmx.h + + MultiMedia eXtensions GCC interface library for IA32. + + To use this library, simply include this header file + and compile with GCC. 
You MUST have inlining enabled + in order for mmx_ok() to work; this can be done by + simply using -O on the GCC command line. + + Compiling with -DMMX_TRACE will cause detailed trace + output to be sent to stderr for each mmx operation. + This adds lots of code, and obviously slows execution to + a crawl, but can be very useful for debugging. + + THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY + EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT + LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY + AND FITNESS FOR ANY PARTICULAR PURPOSE. + + 1997-98 by H. Dietz and R. Fisher + + History: + 97-98* R.Fisher Early versions + 980501 R.Fisher Original Release + 980611* H.Dietz Rewrite, correctly implementing inlines, and + R.Fisher including direct register accesses. + 980616 R.Fisher Release of 980611 as 980616. + 980714 R.Fisher Minor corrections to Makefile, etc. + 980715 R.Fisher mmx_ok() now prevents optimizer from using + clobbered values. + mmx_ok() now checks if cpuid instruction is + available before trying to use it. + 980726* R.Fisher mm_support() searches for AMD 3DNow, Cyrix + Extended MMX, and standard MMX. It returns a + value which is positive if any of these are + supported, and can be masked with constants to + see which. mmx_ok() is now a call to this + 980726* R.Fisher Added i2r support for shift functions + 980919 R.Fisher Fixed AMD extended feature recognition bug. + 980921 R.Fisher Added definition/check for _MMX_H. + Added "float s[2]" to mmx_t for use with + 3DNow and EMMX. So same mmx_t can be used. + 981013 R.Fisher Fixed cpuid function 1 bug (looked at wrong reg) + Fixed psllq_i2r error in mmxtest.c + + * Unreleased (internal or interim) versions + + Notes: + It appears that the latest gas has the pand problem fixed, therefore + I'll undefine BROKEN_PAND by default. + String compares may be quicker than the multiple test/jumps in vendor + test sequence in mmx_ok(), but I'm not concerned with that right now. + + Acknowledgments: + Jussi Laako for pointing out the errors ultimately found to be + connected to the failure to notify the optimizer of clobbered values. + Roger Hardiman for reminding us that CPUID isn't everywhere, and that + someone may actually try to use this on a machine without CPUID. + Also for suggesting code for checking this. + Robert Dale for pointing out the AMD recognition bug. + Jimmy Mayfield and Carl Witty for pointing out the Intel recognition + bug. + Carl Witty for pointing out the psllq_i2r test bug. +*/ + +#ifndef _MMX_H +#define _MMX_H + +/*#define MMX_TRACE */ + +/* Warning: at this writing, the version of GAS packaged + with most Linux distributions does not handle the + parallel AND operation mnemonic correctly. If the + symbol BROKEN_PAND is defined, a slower alternative + coding will be used. If execution of mmxtest results + in an illegal instruction fault, define this symbol. 
+*/ +#undef BROKEN_PAND + + +/* The type of an value that fits in an MMX register + (note that long long constant values MUST be suffixed + by LL and unsigned long long values by ULL, lest + they be truncated by the compiler) +*/ +typedef union { + long long q; /* Quadword (64-bit) value */ + unsigned long long uq; /* Unsigned Quadword */ + int d[2]; /* 2 Doubleword (32-bit) values */ + unsigned int ud[2]; /* 2 Unsigned Doubleword */ + short w[4]; /* 4 Word (16-bit) values */ + unsigned short uw[4]; /* 4 Unsigned Word */ + char b[8]; /* 8 Byte (8-bit) values */ + unsigned char ub[8]; /* 8 Unsigned Byte */ + float s[2]; /* Single-precision (32-bit) value */ +} mmx_t; + + +/* Function to test if multimedia instructions are supported... +*/ +inline extern int +mm_support(void) +{ + /* Returns 1 if MMX instructions are supported, + 3 if Cyrix MMX and Extended MMX instructions are supported + 5 if AMD MMX and 3DNow! instructions are supported + 0 if hardware does not support any of these + */ + register int rval = 0; + + __asm__ __volatile__ ( + /* See if CPUID instruction is supported ... */ + /* ... Get copies of EFLAGS into eax and ecx */ + "pushf\n\t" + "popl %%eax\n\t" + "movl %%eax, %%ecx\n\t" + + /* ... Toggle the ID bit in one copy and store */ + /* to the EFLAGS reg */ + "xorl $0x200000, %%eax\n\t" + "push %%eax\n\t" + "popf\n\t" + + /* ... Get the (hopefully modified) EFLAGS */ + "pushf\n\t" + "popl %%eax\n\t" + + /* ... Compare and test result */ + "xorl %%eax, %%ecx\n\t" + "testl $0x200000, %%ecx\n\t" + "jz NotSupported1\n\t" /* Nothing supported */ + + + /* Get standard CPUID information, and + go to a specific vendor section */ + "movl $0, %%eax\n\t" + "cpuid\n\t" + + /* Check for Intel */ + "cmpl $0x756e6547, %%ebx\n\t" + "jne TryAMD\n\t" + "cmpl $0x49656e69, %%edx\n\t" + "jne TryAMD\n\t" + "cmpl $0x6c65746e, %%ecx\n" + "jne TryAMD\n\t" + "jmp Intel\n\t" + + /* Check for AMD */ + "\nTryAMD:\n\t" + "cmpl $0x68747541, %%ebx\n\t" + "jne TryCyrix\n\t" + "cmpl $0x69746e65, %%edx\n\t" + "jne TryCyrix\n\t" + "cmpl $0x444d4163, %%ecx\n" + "jne TryCyrix\n\t" + "jmp AMD\n\t" + + /* Check for Cyrix */ + "\nTryCyrix:\n\t" + "cmpl $0x69727943, %%ebx\n\t" + "jne NotSupported2\n\t" + "cmpl $0x736e4978, %%edx\n\t" + "jne NotSupported3\n\t" + "cmpl $0x64616574, %%ecx\n\t" + "jne NotSupported4\n\t" + /* Drop through to Cyrix... */ + + + /* Cyrix Section */ + /* See if extended CPUID is supported */ + "movl $0x80000000, %%eax\n\t" + "cpuid\n\t" + "cmpl $0x80000000, %%eax\n\t" + "jl MMXtest\n\t" /* Try standard CPUID instead */ + + /* Extended CPUID supported, so get extended features */ + "movl $0x80000001, %%eax\n\t" + "cpuid\n\t" + "testl $0x00800000, %%eax\n\t" /* Test for MMX */ + "jz NotSupported5\n\t" /* MMX not supported */ + "testl $0x01000000, %%eax\n\t" /* Test for Ext'd MMX */ + "jnz EMMXSupported\n\t" + "movl $1, %0:\n\n\t" /* MMX Supported */ + "jmp Return\n\n" + "EMMXSupported:\n\t" + "movl $3, %0:\n\n\t" /* EMMX and MMX Supported */ + "jmp Return\n\t" + + + /* AMD Section */ + "AMD:\n\t" + + /* See if extended CPUID is supported */ + "movl $0x80000000, %%eax\n\t" + "cpuid\n\t" + "cmpl $0x80000000, %%eax\n\t" + "jl MMXtest\n\t" /* Try standard CPUID instead */ + + /* Extended CPUID supported, so get extended features */ + "movl $0x80000001, %%eax\n\t" + "cpuid\n\t" + "testl $0x00800000, %%edx\n\t" /* Test for MMX */ + "jz NotSupported6\n\t" /* MMX not supported */ + "testl $0x80000000, %%edx\n\t" /* Test for 3DNow! 
*/ + "jnz ThreeDNowSupported\n\t" + "movl $1, %0:\n\n\t" /* MMX Supported */ + "jmp Return\n\n" + "ThreeDNowSupported:\n\t" + "movl $5, %0:\n\n\t" /* 3DNow! and MMX Supported */ + "jmp Return\n\t" + + + /* Intel Section */ + "Intel:\n\t" + + /* Check for MMX */ + "MMXtest:\n\t" + "movl $1, %%eax\n\t" + "cpuid\n\t" + "testl $0x00800000, %%edx\n\t" /* Test for MMX */ + "jz NotSupported7\n\t" /* MMX Not supported */ + "movl $1, %0:\n\n\t" /* MMX Supported */ + "jmp Return\n\t" + + /* Nothing supported */ + "\nNotSupported1:\n\t" + "#movl $101, %0:\n\n\t" + "\nNotSupported2:\n\t" + "#movl $102, %0:\n\n\t" + "\nNotSupported3:\n\t" + "#movl $103, %0:\n\n\t" + "\nNotSupported4:\n\t" + "#movl $104, %0:\n\n\t" + "\nNotSupported5:\n\t" + "#movl $105, %0:\n\n\t" + "\nNotSupported6:\n\t" + "#movl $106, %0:\n\n\t" + "\nNotSupported7:\n\t" + "#movl $107, %0:\n\n\t" + "movl $0, %0:\n\n\t" + + "Return:\n\t" + : "=a" (rval) + : /* no input */ + : "eax", "ebx", "ecx", "edx" + ); + + /* Return */ + return(rval); +} + +/* Function to test if mmx instructions are supported... +*/ +inline extern int +mmx_ok(void) +{ + /* Returns 1 if MMX instructions are supported, 0 otherwise */ + return ( mm_support() & 0x1 ); +} + + +/* Helper functions for the instruction macros that follow... + (note that memory-to-register, m2r, instructions are nearly + as efficient as register-to-register, r2r, instructions; + however, memory-to-memory instructions are really simulated + as a convenience, and are only 1/3 as efficient) +*/ +#ifdef MMX_TRACE + +/* Include the stuff for printing a trace to stderr... +*/ + +#include <stdio.h> + +#define mmx_i2r(op, imm, reg) \ + { \ + mmx_t mmx_trace; \ + mmx_trace = (imm); \ + fprintf(stderr, #op "_i2r(" #imm "=0x%016llx, ", mmx_trace.q); \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (mmx_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #reg "=0x%016llx) => ", mmx_trace.q); \ + __asm__ __volatile__ (#op " %0, %%" #reg \ + : /* nothing */ \ + : "X" (imm)); \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (mmx_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #reg "=0x%016llx\n", mmx_trace.q); \ + } + +#define mmx_m2r(op, mem, reg) \ + { \ + mmx_t mmx_trace; \ + mmx_trace = (mem); \ + fprintf(stderr, #op "_m2r(" #mem "=0x%016llx, ", mmx_trace.q); \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (mmx_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #reg "=0x%016llx) => ", mmx_trace.q); \ + __asm__ __volatile__ (#op " %0, %%" #reg \ + : /* nothing */ \ + : "X" (mem)); \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (mmx_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #reg "=0x%016llx\n", mmx_trace.q); \ + } + +#define mmx_r2m(op, reg, mem) \ + { \ + mmx_t mmx_trace; \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (mmx_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #op "_r2m(" #reg "=0x%016llx, ", mmx_trace.q); \ + mmx_trace = (mem); \ + fprintf(stderr, #mem "=0x%016llx) => ", mmx_trace.q); \ + __asm__ __volatile__ (#op " %%" #reg ", %0" \ + : "=X" (mem) \ + : /* nothing */ ); \ + mmx_trace = (mem); \ + fprintf(stderr, #mem "=0x%016llx\n", mmx_trace.q); \ + } + +#define mmx_r2r(op, regs, regd) \ + { \ + mmx_t mmx_trace; \ + __asm__ __volatile__ ("movq %%" #regs ", %0" \ + : "=X" (mmx_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #op "_r2r(" #regs "=0x%016llx, ", mmx_trace.q); \ + __asm__ __volatile__ ("movq %%" #regd ", %0" \ + : "=X" (mmx_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #regd "=0x%016llx) => ", mmx_trace.q); \ 
+ __asm__ __volatile__ (#op " %" #regs ", %" #regd); \ + __asm__ __volatile__ ("movq %%" #regd ", %0" \ + : "=X" (mmx_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #regd "=0x%016llx\n", mmx_trace.q); \ + } + +#define mmx_m2m(op, mems, memd) \ + { \ + mmx_t mmx_trace; \ + mmx_trace = (mems); \ + fprintf(stderr, #op "_m2m(" #mems "=0x%016llx, ", mmx_trace.q); \ + mmx_trace = (memd); \ + fprintf(stderr, #memd "=0x%016llx) => ", mmx_trace.q); \ + __asm__ __volatile__ ("movq %0, %%mm0\n\t" \ + #op " %1, %%mm0\n\t" \ + "movq %%mm0, %0" \ + : "=X" (memd) \ + : "X" (mems)); \ + mmx_trace = (memd); \ + fprintf(stderr, #memd "=0x%016llx\n", mmx_trace.q); \ + } + +#else + +/* These macros are a lot simpler without the tracing... +*/ + +#define mmx_i2r(op, imm, reg) \ + __asm__ __volatile__ (#op " $" #imm ", %%" #reg \ + : /* nothing */ \ + : /* nothing */); + +#define mmx_m2r(op, mem, reg) \ + __asm__ __volatile__ (#op " %0, %%" #reg \ + : /* nothing */ \ + : "m" (mem)) + +#define mmx_r2m(op, reg, mem) \ + __asm__ __volatile__ (#op " %%" #reg ", %0" \ + : "=m" (mem) \ + : /* nothing */ ) + +#define mmx_r2r(op, regs, regd) \ + __asm__ __volatile__ (#op " %" #regs ", %" #regd) + +#define mmx_m2m(op, mems, memd) \ + __asm__ __volatile__ ("movq %0, %%mm0\n\t" \ + #op " %1, %%mm0\n\t" \ + "movq %%mm0, %0" \ + : "=m" (memd) \ + : "m" (mems)) + +#endif + + +/* 1x64 MOVe Quadword + (this is both a load and a store... + in fact, it is the only way to store) +*/ +#define movq_m2r(var, reg) mmx_m2r(movq, var, reg) +#define movq_r2m(reg, var) mmx_r2m(movq, reg, var) +#define movq_r2r(regs, regd) mmx_r2r(movq, regs, regd) +#define movq(vars, vard) \ + __asm__ __volatile__ ("movq %1, %%mm0\n\t" \ + "movq %%mm0, %0" \ + : "=X" (vard) \ + : "X" (vars)) + + +/* 1x32 MOVe Doubleword + (like movq, this is both load and store... 
+ but is most useful for moving things between + mmx registers and ordinary registers) +*/ +#define movd_m2r(var, reg) mmx_m2r(movd, var, reg) +#define movd_r2m(reg, var) mmx_r2m(movd, reg, var) +#define movd_r2r(regs, regd) mmx_r2r(movd, regs, regd) +#define movd(vars, vard) \ + __asm__ __volatile__ ("movd %1, %%mm0\n\t" \ + "movd %%mm0, %0" \ + : "=X" (vard) \ + : "X" (vars)) + + +/* 2x32, 4x16, and 8x8 Parallel ADDs +*/ +#define paddd_m2r(var, reg) mmx_m2r(paddd, var, reg) +#define paddd_r2r(regs, regd) mmx_r2r(paddd, regs, regd) +#define paddd(vars, vard) mmx_m2m(paddd, vars, vard) + +#define paddw_m2r(var, reg) mmx_m2r(paddw, var, reg) +#define paddw_r2r(regs, regd) mmx_r2r(paddw, regs, regd) +#define paddw(vars, vard) mmx_m2m(paddw, vars, vard) + +#define paddb_m2r(var, reg) mmx_m2r(paddb, var, reg) +#define paddb_r2r(regs, regd) mmx_r2r(paddb, regs, regd) +#define paddb(vars, vard) mmx_m2m(paddb, vars, vard) + + +/* 4x16 and 8x8 Parallel ADDs using Saturation arithmetic +*/ +#define paddsw_m2r(var, reg) mmx_m2r(paddsw, var, reg) +#define paddsw_r2r(regs, regd) mmx_r2r(paddsw, regs, regd) +#define paddsw(vars, vard) mmx_m2m(paddsw, vars, vard) + +#define paddsb_m2r(var, reg) mmx_m2r(paddsb, var, reg) +#define paddsb_r2r(regs, regd) mmx_r2r(paddsb, regs, regd) +#define paddsb(vars, vard) mmx_m2m(paddsb, vars, vard) + + +/* 4x16 and 8x8 Parallel ADDs using Unsigned Saturation arithmetic +*/ +#define paddusw_m2r(var, reg) mmx_m2r(paddusw, var, reg) +#define paddusw_r2r(regs, regd) mmx_r2r(paddusw, regs, regd) +#define paddusw(vars, vard) mmx_m2m(paddusw, vars, vard) + +#define paddusb_m2r(var, reg) mmx_m2r(paddusb, var, reg) +#define paddusb_r2r(regs, regd) mmx_r2r(paddusb, regs, regd) +#define paddusb(vars, vard) mmx_m2m(paddusb, vars, vard) + + +/* 2x32, 4x16, and 8x8 Parallel SUBs +*/ +#define psubd_m2r(var, reg) mmx_m2r(psubd, var, reg) +#define psubd_r2r(regs, regd) mmx_r2r(psubd, regs, regd) +#define psubd(vars, vard) mmx_m2m(psubd, vars, vard) + +#define psubw_m2r(var, reg) mmx_m2r(psubw, var, reg) +#define psubw_r2r(regs, regd) mmx_r2r(psubw, regs, regd) +#define psubw(vars, vard) mmx_m2m(psubw, vars, vard) + +#define psubb_m2r(var, reg) mmx_m2r(psubb, var, reg) +#define psubb_r2r(regs, regd) mmx_r2r(psubb, regs, regd) +#define psubb(vars, vard) mmx_m2m(psubb, vars, vard) + + +/* 4x16 and 8x8 Parallel SUBs using Saturation arithmetic +*/ +#define psubsw_m2r(var, reg) mmx_m2r(psubsw, var, reg) +#define psubsw_r2r(regs, regd) mmx_r2r(psubsw, regs, regd) +#define psubsw(vars, vard) mmx_m2m(psubsw, vars, vard) + +#define psubsb_m2r(var, reg) mmx_m2r(psubsb, var, reg) +#define psubsb_r2r(regs, regd) mmx_r2r(psubsb, regs, regd) +#define psubsb(vars, vard) mmx_m2m(psubsb, vars, vard) + + +/* 4x16 and 8x8 Parallel SUBs using Unsigned Saturation arithmetic +*/ +#define psubusw_m2r(var, reg) mmx_m2r(psubusw, var, reg) +#define psubusw_r2r(regs, regd) mmx_r2r(psubusw, regs, regd) +#define psubusw(vars, vard) mmx_m2m(psubusw, vars, vard) + +#define psubusb_m2r(var, reg) mmx_m2r(psubusb, var, reg) +#define psubusb_r2r(regs, regd) mmx_r2r(psubusb, regs, regd) +#define psubusb(vars, vard) mmx_m2m(psubusb, vars, vard) + + +/* 4x16 Parallel MULs giving Low 4x16 portions of results +*/ +#define pmullw_m2r(var, reg) mmx_m2r(pmullw, var, reg) +#define pmullw_r2r(regs, regd) mmx_r2r(pmullw, regs, regd) +#define pmullw(vars, vard) mmx_m2m(pmullw, vars, vard) + + +/* 4x16 Parallel MULs giving High 4x16 portions of results +*/ +#define pmulhw_m2r(var, reg) mmx_m2r(pmulhw, var, reg) +#define 
pmulhw_r2r(regs, regd) mmx_r2r(pmulhw, regs, regd) +#define pmulhw(vars, vard) mmx_m2m(pmulhw, vars, vard) + + +/* 4x16->2x32 Parallel Mul-ADD + (muls like pmullw, then adds adjacent 16-bit fields + in the multiply result to make the final 2x32 result) +*/ +#define pmaddwd_m2r(var, reg) mmx_m2r(pmaddwd, var, reg) +#define pmaddwd_r2r(regs, regd) mmx_r2r(pmaddwd, regs, regd) +#define pmaddwd(vars, vard) mmx_m2m(pmaddwd, vars, vard) + + +/* 1x64 bitwise AND +*/ +#ifdef BROKEN_PAND +#define pand_m2r(var, reg) \ + { \ + mmx_m2r(pandn, (mmx_t) -1LL, reg); \ + mmx_m2r(pandn, var, reg); \ + } +#define pand_r2r(regs, regd) \ + { \ + mmx_m2r(pandn, (mmx_t) -1LL, regd); \ + mmx_r2r(pandn, regs, regd); \ + } +#define pand(vars, vard) \ + { \ + movq_m2r(vard, mm0); \ + mmx_m2r(pandn, (mmx_t) -1LL, mm0); \ + mmx_m2r(pandn, vars, mm0); \ + movq_r2m(mm0, vard); \ + } +#else +#define pand_m2r(var, reg) mmx_m2r(pand, var, reg) +#define pand_r2r(regs, regd) mmx_r2r(pand, regs, regd) +#define pand(vars, vard) mmx_m2m(pand, vars, vard) +#endif + + +/* 1x64 bitwise AND with Not the destination +*/ +#define pandn_m2r(var, reg) mmx_m2r(pandn, var, reg) +#define pandn_r2r(regs, regd) mmx_r2r(pandn, regs, regd) +#define pandn(vars, vard) mmx_m2m(pandn, vars, vard) + + +/* 1x64 bitwise OR +*/ +#define por_m2r(var, reg) mmx_m2r(por, var, reg) +#define por_r2r(regs, regd) mmx_r2r(por, regs, regd) +#define por(vars, vard) mmx_m2m(por, vars, vard) + + +/* 1x64 bitwise eXclusive OR +*/ +#define pxor_m2r(var, reg) mmx_m2r(pxor, var, reg) +#define pxor_r2r(regs, regd) mmx_r2r(pxor, regs, regd) +#define pxor(vars, vard) mmx_m2m(pxor, vars, vard) + + +/* 2x32, 4x16, and 8x8 Parallel CoMPare for EQuality + (resulting fields are either 0 or -1) +*/ +#define pcmpeqd_m2r(var, reg) mmx_m2r(pcmpeqd, var, reg) +#define pcmpeqd_r2r(regs, regd) mmx_r2r(pcmpeqd, regs, regd) +#define pcmpeqd(vars, vard) mmx_m2m(pcmpeqd, vars, vard) + +#define pcmpeqw_m2r(var, reg) mmx_m2r(pcmpeqw, var, reg) +#define pcmpeqw_r2r(regs, regd) mmx_r2r(pcmpeqw, regs, regd) +#define pcmpeqw(vars, vard) mmx_m2m(pcmpeqw, vars, vard) + +#define pcmpeqb_m2r(var, reg) mmx_m2r(pcmpeqb, var, reg) +#define pcmpeqb_r2r(regs, regd) mmx_r2r(pcmpeqb, regs, regd) +#define pcmpeqb(vars, vard) mmx_m2m(pcmpeqb, vars, vard) + + +/* 2x32, 4x16, and 8x8 Parallel CoMPare for Greater Than + (resulting fields are either 0 or -1) +*/ +#define pcmpgtd_m2r(var, reg) mmx_m2r(pcmpgtd, var, reg) +#define pcmpgtd_r2r(regs, regd) mmx_r2r(pcmpgtd, regs, regd) +#define pcmpgtd(vars, vard) mmx_m2m(pcmpgtd, vars, vard) + +#define pcmpgtw_m2r(var, reg) mmx_m2r(pcmpgtw, var, reg) +#define pcmpgtw_r2r(regs, regd) mmx_r2r(pcmpgtw, regs, regd) +#define pcmpgtw(vars, vard) mmx_m2m(pcmpgtw, vars, vard) + +#define pcmpgtb_m2r(var, reg) mmx_m2r(pcmpgtb, var, reg) +#define pcmpgtb_r2r(regs, regd) mmx_r2r(pcmpgtb, regs, regd) +#define pcmpgtb(vars, vard) mmx_m2m(pcmpgtb, vars, vard) + + +/* 1x64, 2x32, and 4x16 Parallel Shift Left Logical +*/ +#define psllq_i2r(imm, reg) mmx_i2r(psllq, imm, reg) +#define psllq_m2r(var, reg) mmx_m2r(psllq, var, reg) +#define psllq_r2r(regs, regd) mmx_r2r(psllq, regs, regd) +#define psllq(vars, vard) mmx_m2m(psllq, vars, vard) + +#define pslld_i2r(imm, reg) mmx_i2r(pslld, imm, reg) +#define pslld_m2r(var, reg) mmx_m2r(pslld, var, reg) +#define pslld_r2r(regs, regd) mmx_r2r(pslld, regs, regd) +#define pslld(vars, vard) mmx_m2m(pslld, vars, vard) + +#define psllw_i2r(imm, reg) mmx_i2r(psllw, imm, reg) +#define psllw_m2r(var, reg) mmx_m2r(psllw, var, reg) +#define 
psllw_r2r(regs, regd) mmx_r2r(psllw, regs, regd) +#define psllw(vars, vard) mmx_m2m(psllw, vars, vard) + + +/* 1x64, 2x32, and 4x16 Parallel Shift Right Logical +*/ +#define psrlq_i2r(imm, reg) mmx_i2r(psrlq, imm, reg) +#define psrlq_m2r(var, reg) mmx_m2r(psrlq, var, reg) +#define psrlq_r2r(regs, regd) mmx_r2r(psrlq, regs, regd) +#define psrlq(vars, vard) mmx_m2m(psrlq, vars, vard) + +#define psrld_i2r(imm, reg) mmx_i2r(psrld, imm, reg) +#define psrld_m2r(var, reg) mmx_m2r(psrld, var, reg) +#define psrld_r2r(regs, regd) mmx_r2r(psrld, regs, regd) +#define psrld(vars, vard) mmx_m2m(psrld, vars, vard) + +#define psrlw_i2r(imm, reg) mmx_i2r(psrlw, imm, reg) +#define psrlw_m2r(var, reg) mmx_m2r(psrlw, var, reg) +#define psrlw_r2r(regs, regd) mmx_r2r(psrlw, regs, regd) +#define psrlw(vars, vard) mmx_m2m(psrlw, vars, vard) + + +/* 2x32 and 4x16 Parallel Shift Right Arithmetic +*/ +#define psrad_i2r(imm, reg) mmx_i2r(psrad, imm, reg) +#define psrad_m2r(var, reg) mmx_m2r(psrad, var, reg) +#define psrad_r2r(regs, regd) mmx_r2r(psrad, regs, regd) +#define psrad(vars, vard) mmx_m2m(psrad, vars, vard) + +#define psraw_i2r(imm, reg) mmx_i2r(psraw, imm, reg) +#define psraw_m2r(var, reg) mmx_m2r(psraw, var, reg) +#define psraw_r2r(regs, regd) mmx_r2r(psraw, regs, regd) +#define psraw(vars, vard) mmx_m2m(psraw, vars, vard) + + +/* 2x32->4x16 and 4x16->8x8 PACK and Signed Saturate + (packs source and dest fields into dest in that order) +*/ +#define packssdw_m2r(var, reg) mmx_m2r(packssdw, var, reg) +#define packssdw_r2r(regs, regd) mmx_r2r(packssdw, regs, regd) +#define packssdw(vars, vard) mmx_m2m(packssdw, vars, vard) + +#define packsswb_m2r(var, reg) mmx_m2r(packsswb, var, reg) +#define packsswb_r2r(regs, regd) mmx_r2r(packsswb, regs, regd) +#define packsswb(vars, vard) mmx_m2m(packsswb, vars, vard) + + +/* 4x16->8x8 PACK and Unsigned Saturate + (packs source and dest fields into dest in that order) +*/ +#define packuswb_m2r(var, reg) mmx_m2r(packuswb, var, reg) +#define packuswb_r2r(regs, regd) mmx_r2r(packuswb, regs, regd) +#define packuswb(vars, vard) mmx_m2m(packuswb, vars, vard) + + +/* 2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK Low + (interleaves low half of dest with low half of source + as padding in each result field) +*/ +#define punpckldq_m2r(var, reg) mmx_m2r(punpckldq, var, reg) +#define punpckldq_r2r(regs, regd) mmx_r2r(punpckldq, regs, regd) +#define punpckldq(vars, vard) mmx_m2m(punpckldq, vars, vard) + +#define punpcklwd_m2r(var, reg) mmx_m2r(punpcklwd, var, reg) +#define punpcklwd_r2r(regs, regd) mmx_r2r(punpcklwd, regs, regd) +#define punpcklwd(vars, vard) mmx_m2m(punpcklwd, vars, vard) + +#define punpcklbw_m2r(var, reg) mmx_m2r(punpcklbw, var, reg) +#define punpcklbw_r2r(regs, regd) mmx_r2r(punpcklbw, regs, regd) +#define punpcklbw(vars, vard) mmx_m2m(punpcklbw, vars, vard) + + +/* 2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK High + (interleaves high half of dest with high half of source + as padding in each result field) +*/ +#define punpckhdq_m2r(var, reg) mmx_m2r(punpckhdq, var, reg) +#define punpckhdq_r2r(regs, regd) mmx_r2r(punpckhdq, regs, regd) +#define punpckhdq(vars, vard) mmx_m2m(punpckhdq, vars, vard) + +#define punpckhwd_m2r(var, reg) mmx_m2r(punpckhwd, var, reg) +#define punpckhwd_r2r(regs, regd) mmx_r2r(punpckhwd, regs, regd) +#define punpckhwd(vars, vard) mmx_m2m(punpckhwd, vars, vard) + +#define punpckhbw_m2r(var, reg) mmx_m2r(punpckhbw, var, reg) +#define punpckhbw_r2r(regs, regd) mmx_r2r(punpckhbw, regs, regd) +#define punpckhbw(vars, vard) mmx_m2m(punpckhbw, 
vars, vard) + + +/* Empty MMx State + (used to clean-up when going from mmx to float use + of the registers that are shared by both; note that + there is no float-to-mmx operation needed, because + only the float tag word info is corruptible) +*/ +#ifdef MMX_TRACE + +#define emms() \ + { \ + fprintf(stderr, "emms()\n"); \ + __asm__ __volatile__ ("emms"); \ + } + +#else + +#define emms() __asm__ __volatile__ ("emms") + +#endif + +#endif diff --git a/gst/deinterlace/tvtime/plugins.h b/gst/deinterlace/tvtime/plugins.h new file mode 100644 index 00000000..8fb01af5 --- /dev/null +++ b/gst/deinterlace/tvtime/plugins.h @@ -0,0 +1,54 @@ +/* + * + * GStreamer + * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net> + * Copyright (C) 2008 Sebastian Dröge <slomo@collabora.co.uk> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Relicensed for GStreamer from GPL to LGPL with permit from Billy Biggs. + * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578 + */ + +#ifndef TVTIME_PLUGINS_H_INCLUDED +#define TVTIME_PLUGINS_H_INCLUDED + +#define GST_TYPE_DEINTERLACE_TOMSMOCOMP (gst_deinterlace_method_tomsmocomp_get_type ()) +#define GST_TYPE_DEINTERLACE_GREEDY_H (gst_deinterlace_method_greedy_h_get_type ()) +#define GST_TYPE_DEINTERLACE_GREEDY_L (gst_deinterlace_method_greedy_l_get_type ()) +#define GST_TYPE_DEINTERLACE_VFIR (gst_deinterlace_method_vfir_get_type ()) +#define GST_TYPE_DEINTERLACE_LINEAR (gst_deinterlace_method_linear_get_type ()) +#define GST_TYPE_DEINTERLACE_LINEAR_BLEND (gst_deinterlace_method_linear_blend_get_type ()) +#define GST_TYPE_DEINTERLACE_SCALER_BOB (gst_deinterlace_method_scaler_bob_get_type ()) +#define GST_TYPE_DEINTERLACE_WEAVE (gst_deinterlace_method_weave_get_type ()) +#define GST_TYPE_DEINTERLACE_WEAVE_TFF (gst_deinterlace_method_weave_tff_get_type ()) +#define GST_TYPE_DEINTERLACE_WEAVE_BFF (gst_deinterlace_method_weave_bff_get_type ()) + +GType gst_deinterlace_method_tomsmocomp_get_type (void); +GType gst_deinterlace_method_greedy_h_get_type (void); +GType gst_deinterlace_method_greedy_l_get_type (void); +GType gst_deinterlace_method_vfir_get_type (void); + +GType gst_deinterlace_method_linear_get_type (void); +GType gst_deinterlace_method_linear_blend_get_type (void); +GType gst_deinterlace_method_scaler_bob_get_type (void); +GType gst_deinterlace_method_weave_get_type (void); +GType gst_deinterlace_method_weave_tff_get_type (void); +GType gst_deinterlace_method_weave_bff_get_type (void); + +#endif /* TVTIME_PLUGINS_H_INCLUDED */ diff --git a/gst/deinterlace/tvtime/scalerbob.c b/gst/deinterlace/tvtime/scalerbob.c new file mode 100644 index 00000000..a7bca169 --- /dev/null +++ b/gst/deinterlace/tvtime/scalerbob.c @@ -0,0 +1,74 @@ +/** + * Double lines + * Copyright (C) 2008 Sebastian Dröge <sebastian.droege@collabora.co.uk> + * + * This library is free 
software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "_stdint.h"
+#include "gstdeinterlace.h"
+#include <string.h>
+
+#define GST_TYPE_DEINTERLACE_METHOD_SCALER_BOB (gst_deinterlace_method_scaler_bob_get_type ())
+#define GST_IS_DEINTERLACE_METHOD_SCALER_BOB(obj) (G_TYPE_CHECK_INSTANCE_TYPE ((obj), GST_TYPE_DEINTERLACE_METHOD_SCALER_BOB))
+#define GST_IS_DEINTERLACE_METHOD_SCALER_BOB_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), GST_TYPE_DEINTERLACE_METHOD_SCALER_BOB))
+#define GST_DEINTERLACE_METHOD_SCALER_BOB_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS ((obj), GST_TYPE_DEINTERLACE_METHOD_SCALER_BOB, GstDeinterlaceMethodScalerBobClass))
+#define GST_DEINTERLACE_METHOD_SCALER_BOB(obj) (G_TYPE_CHECK_INSTANCE_CAST ((obj), GST_TYPE_DEINTERLACE_METHOD_SCALER_BOB, GstDeinterlaceMethodScalerBob))
+#define GST_DEINTERLACE_METHOD_SCALER_BOB_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST ((klass), GST_TYPE_DEINTERLACE_METHOD_SCALER_BOB, GstDeinterlaceMethodScalerBobClass))
+#define GST_DEINTERLACE_METHOD_SCALER_BOB_CAST(obj) ((GstDeinterlaceMethodScalerBob*)(obj))
+
+GType gst_deinterlace_method_scaler_bob_get_type (void);
+
+typedef GstDeinterlaceSimpleMethod GstDeinterlaceMethodScalerBob;
+
+typedef GstDeinterlaceSimpleMethodClass GstDeinterlaceMethodScalerBobClass;
+
+
+static void
+deinterlace_scanline_scaler_bob (GstDeinterlaceMethod * self,
+    GstDeinterlace * parent, guint8 * out,
+    GstDeinterlaceScanlineData * scanlines, gint width)
+{
+  oil_memcpy (out, scanlines->t0, parent->row_stride);
+}
+
+G_DEFINE_TYPE (GstDeinterlaceMethodScalerBob, gst_deinterlace_method_scaler_bob,
+    GST_TYPE_DEINTERLACE_SIMPLE_METHOD);
+
+static void
+gst_deinterlace_method_scaler_bob_class_init (GstDeinterlaceMethodScalerBobClass
+    * klass)
+{
+  GstDeinterlaceMethodClass *dim_class = (GstDeinterlaceMethodClass *) klass;
+  GstDeinterlaceSimpleMethodClass *dism_class =
+      (GstDeinterlaceSimpleMethodClass *) klass;
+
+  dim_class->fields_required = 1;
+  dim_class->name = "Double lines";
+  dim_class->nick = "scalerbob";
+  dim_class->latency = 0;
+
+  dism_class->interpolate_scanline = deinterlace_scanline_scaler_bob;
+}
+
+static void
+gst_deinterlace_method_scaler_bob_init (GstDeinterlaceMethodScalerBob * self)
+{
+} diff --git a/gst/deinterlace/tvtime/sse.h b/gst/deinterlace/tvtime/sse.h new file mode 100644 index 00000000..2e00ee0c --- /dev/null +++ b/gst/deinterlace/tvtime/sse.h @@ -0,0 +1,992 @@ +/* sse.h
+
+	Streaming SIMD Extensions (a.k.a. Katmai New Instructions)
+	GCC interface library for IA32.
+
+	To use this library, simply include this header file
+	and compile with GCC.  You MUST have inlining enabled
+	in order for sse_ok() to work; this can be done by
+	simply using -O on the GCC command line.
+ + Compiling with -DSSE_TRACE will cause detailed trace + output to be sent to stderr for each sse operation. + This adds lots of code, and obviously slows execution to + a crawl, but can be very useful for debugging. + + THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY + EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT + LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY + AND FITNESS FOR ANY PARTICULAR PURPOSE. + + 1999 by R. Fisher + Based on libmmx by H. Dietz and R. Fisher + + Notes: + This is still extremely alpha. + Because this library depends on an assembler which understands the + SSE opcodes, you probably won't be able to use this yet. + For now, do not use TRACE versions. These both make use + of the MMX registers, not the SSE registers. This will be resolved + at a later date. + ToDo: + Rewrite TRACE macros + Major Debugging Work +*/ + +#ifndef _SSE_H +#define _SSE_H + + + +/* The type of an value that fits in an SSE register + (note that long long constant values MUST be suffixed + by LL and unsigned long long values by ULL, lest + they be truncated by the compiler) +*/ +typedef union { + float sf[4]; /* Single-precision (32-bit) value */ +} __attribute__ ((aligned (16))) sse_t; /* On a 16 byte (128-bit) boundary */ + + +#if 0 +/* Function to test if multimedia instructions are supported... +*/ +inline extern int +mm_support(void) +{ + /* Returns 1 if MMX instructions are supported, + 3 if Cyrix MMX and Extended MMX instructions are supported + 5 if AMD MMX and 3DNow! instructions are supported + 9 if MMX and SSE instructions are supported + 0 if hardware does not support any of these + */ + register int rval = 0; + + __asm__ __volatile__ ( + /* See if CPUID instruction is supported ... */ + /* ... Get copies of EFLAGS into eax and ecx */ + "pushf\n\t" + "popl %%eax\n\t" + "movl %%eax, %%ecx\n\t" + + /* ... Toggle the ID bit in one copy and store */ + /* to the EFLAGS reg */ + "xorl $0x200000, %%eax\n\t" + "push %%eax\n\t" + "popf\n\t" + + /* ... Get the (hopefully modified) EFLAGS */ + "pushf\n\t" + "popl %%eax\n\t" + + /* ... Compare and test result */ + "xorl %%eax, %%ecx\n\t" + "testl $0x200000, %%ecx\n\t" + "jz NotSupported1\n\t" /* CPUID not supported */ + + + /* Get standard CPUID information, and + go to a specific vendor section */ + "movl $0, %%eax\n\t" + "cpuid\n\t" + + /* Check for Intel */ + "cmpl $0x756e6547, %%ebx\n\t" + "jne TryAMD\n\t" + "cmpl $0x49656e69, %%edx\n\t" + "jne TryAMD\n\t" + "cmpl $0x6c65746e, %%ecx\n" + "jne TryAMD\n\t" + "jmp Intel\n\t" + + /* Check for AMD */ + "\nTryAMD:\n\t" + "cmpl $0x68747541, %%ebx\n\t" + "jne TryCyrix\n\t" + "cmpl $0x69746e65, %%edx\n\t" + "jne TryCyrix\n\t" + "cmpl $0x444d4163, %%ecx\n" + "jne TryCyrix\n\t" + "jmp AMD\n\t" + + /* Check for Cyrix */ + "\nTryCyrix:\n\t" + "cmpl $0x69727943, %%ebx\n\t" + "jne NotSupported2\n\t" + "cmpl $0x736e4978, %%edx\n\t" + "jne NotSupported3\n\t" + "cmpl $0x64616574, %%ecx\n\t" + "jne NotSupported4\n\t" + /* Drop through to Cyrix... */ + + + /* Cyrix Section */ + /* See if extended CPUID level 80000001 is supported */ + /* The value of CPUID/80000001 for the 6x86MX is undefined + according to the Cyrix CPU Detection Guide (Preliminary + Rev. 1.01 table 1), so we'll check the value of eax for + CPUID/0 to see if standard CPUID level 2 is supported. + According to the table, the only CPU which supports level + 2 is also the only one which supports extended CPUID levels. 
+ */ + "cmpl $0x2, %%eax\n\t" + "jne MMXtest\n\t" /* Use standard CPUID instead */ + + /* Extended CPUID supported (in theory), so get extended + features */ + "movl $0x80000001, %%eax\n\t" + "cpuid\n\t" + "testl $0x00800000, %%eax\n\t" /* Test for MMX */ + "jz NotSupported5\n\t" /* MMX not supported */ + "testl $0x01000000, %%eax\n\t" /* Test for Ext'd MMX */ + "jnz EMMXSupported\n\t" + "movl $1, %0:\n\n\t" /* MMX Supported */ + "jmp Return\n\n" + "EMMXSupported:\n\t" + "movl $3, %0:\n\n\t" /* EMMX and MMX Supported */ + "jmp Return\n\t" + + + /* AMD Section */ + "AMD:\n\t" + + /* See if extended CPUID is supported */ + "movl $0x80000000, %%eax\n\t" + "cpuid\n\t" + "cmpl $0x80000000, %%eax\n\t" + "jl MMXtest\n\t" /* Use standard CPUID instead */ + + /* Extended CPUID supported, so get extended features */ + "movl $0x80000001, %%eax\n\t" + "cpuid\n\t" + "testl $0x00800000, %%edx\n\t" /* Test for MMX */ + "jz NotSupported6\n\t" /* MMX not supported */ + "testl $0x80000000, %%edx\n\t" /* Test for 3DNow! */ + "jnz ThreeDNowSupported\n\t" + "movl $1, %0:\n\n\t" /* MMX Supported */ + "jmp Return\n\n" + "ThreeDNowSupported:\n\t" + "movl $5, %0:\n\n\t" /* 3DNow! and MMX Supported */ + "jmp Return\n\t" + + + /* Intel Section */ + "Intel:\n\t" + + /* Check for SSE */ + "SSEtest:\n\t" + "movl $1, %%eax\n\t" + "cpuid\n\t" + "testl $0x02000000, %%edx\n\t" /* Test for SSE */ + "jz MMXtest\n\t" /* SSE Not supported */ + "movl $9, %0:\n\n\t" /* SSE Supported */ + "jmp Return\n\t" + + /* Check for MMX */ + "MMXtest:\n\t" + "movl $1, %%eax\n\t" + "cpuid\n\t" + "testl $0x00800000, %%edx\n\t" /* Test for MMX */ + "jz NotSupported7\n\t" /* MMX Not supported */ + "movl $1, %0:\n\n\t" /* MMX Supported */ + "jmp Return\n\t" + + /* Nothing supported */ + "\nNotSupported1:\n\t" + "#movl $101, %0:\n\n\t" + "\nNotSupported2:\n\t" + "#movl $102, %0:\n\n\t" + "\nNotSupported3:\n\t" + "#movl $103, %0:\n\n\t" + "\nNotSupported4:\n\t" + "#movl $104, %0:\n\n\t" + "\nNotSupported5:\n\t" + "#movl $105, %0:\n\n\t" + "\nNotSupported6:\n\t" + "#movl $106, %0:\n\n\t" + "\nNotSupported7:\n\t" + "#movl $107, %0:\n\n\t" + "movl $0, %0:\n\n\t" + + "Return:\n\t" + : "=a" (rval) + : /* no input */ + : "eax", "ebx", "ecx", "edx" + ); + + /* Return */ + return(rval); +} + +/* Function to test if sse instructions are supported... +*/ +inline extern int +sse_ok(void) +{ + /* Returns 1 if SSE instructions are supported, 0 otherwise */ + return ( (mm_support() & 0x8) >> 3 ); +} +#endif + + + +/* Helper functions for the instruction macros that follow... + (note that memory-to-register, m2r, instructions are nearly + as efficient as register-to-register, r2r, instructions; + however, memory-to-memory instructions are really simulated + as a convenience, and are only 1/3 as efficient) +*/ +#ifdef SSE_TRACE + +/* Include the stuff for printing a trace to stderr... 
+*/ + +#include <stdio.h> + +#define sse_i2r(op, imm, reg) \ + { \ + sse_t sse_trace; \ + sse_trace.uq = (imm); \ + fprintf(stderr, #op "_i2r(" #imm "=0x%08x%08x, ", \ + sse_trace.d[1], sse_trace.d[0]); \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (sse_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #reg "=0x%08x%08x) => ", \ + sse_trace.d[1], sse_trace.d[0]); \ + __asm__ __volatile__ (#op " %0, %%" #reg \ + : /* nothing */ \ + : "X" (imm)); \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (sse_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #reg "=0x%08x%08x\n", \ + sse_trace.d[1], sse_trace.d[0]); \ + } + +#define sse_m2r(op, mem, reg) \ + { \ + sse_t sse_trace; \ + sse_trace = (mem); \ + fprintf(stderr, #op "_m2r(" #mem "=0x%08x%08x, ", \ + sse_trace.d[1], sse_trace.d[0]); \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (sse_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #reg "=0x%08x%08x) => ", \ + sse_trace.d[1], sse_trace.d[0]); \ + __asm__ __volatile__ (#op " %0, %%" #reg \ + : /* nothing */ \ + : "X" (mem)); \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (sse_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #reg "=0x%08x%08x\n", \ + sse_trace.d[1], sse_trace.d[0]); \ + } + +#define sse_r2m(op, reg, mem) \ + { \ + sse_t sse_trace; \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (sse_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #op "_r2m(" #reg "=0x%08x%08x, ", \ + sse_trace.d[1], sse_trace.d[0]); \ + sse_trace = (mem); \ + fprintf(stderr, #mem "=0x%08x%08x) => ", \ + sse_trace.d[1], sse_trace.d[0]); \ + __asm__ __volatile__ (#op " %%" #reg ", %0" \ + : "=X" (mem) \ + : /* nothing */ ); \ + sse_trace = (mem); \ + fprintf(stderr, #mem "=0x%08x%08x\n", \ + sse_trace.d[1], sse_trace.d[0]); \ + } + +#define sse_r2r(op, regs, regd) \ + { \ + sse_t sse_trace; \ + __asm__ __volatile__ ("movq %%" #regs ", %0" \ + : "=X" (sse_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #op "_r2r(" #regs "=0x%08x%08x, ", \ + sse_trace.d[1], sse_trace.d[0]); \ + __asm__ __volatile__ ("movq %%" #regd ", %0" \ + : "=X" (sse_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #regd "=0x%08x%08x) => ", \ + sse_trace.d[1], sse_trace.d[0]); \ + __asm__ __volatile__ (#op " %" #regs ", %" #regd); \ + __asm__ __volatile__ ("movq %%" #regd ", %0" \ + : "=X" (sse_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #regd "=0x%08x%08x\n", \ + sse_trace.d[1], sse_trace.d[0]); \ + } + +#define sse_m2m(op, mems, memd) \ + { \ + sse_t sse_trace; \ + sse_trace = (mems); \ + fprintf(stderr, #op "_m2m(" #mems "=0x%08x%08x, ", \ + sse_trace.d[1], sse_trace.d[0]); \ + sse_trace = (memd); \ + fprintf(stderr, #memd "=0x%08x%08x) => ", \ + sse_trace.d[1], sse_trace.d[0]); \ + __asm__ __volatile__ ("movq %0, %%mm0\n\t" \ + #op " %1, %%mm0\n\t" \ + "movq %%mm0, %0" \ + : "=X" (memd) \ + : "X" (mems)); \ + sse_trace = (memd); \ + fprintf(stderr, #memd "=0x%08x%08x\n", \ + sse_trace.d[1], sse_trace.d[0]); \ + } + +#else + +/* These macros are a lot simpler without the tracing... 
+*/
+
+#define sse_i2r(op, imm, reg) \
+	__asm__ __volatile__ (#op " %0, %%" #reg \
+			      : /* nothing */ \
+			      : "X" (imm) )
+
+#define sse_m2r(op, mem, reg) \
+	__asm__ __volatile__ (#op " %0, %%" #reg \
+			      : /* nothing */ \
+			      : "X" (mem))
+
+#define sse_r2m(op, reg, mem) \
+	__asm__ __volatile__ (#op " %%" #reg ", %0" \
+			      : "=X" (mem) \
+			      : /* nothing */ )
+
+#define sse_r2r(op, regs, regd) \
+	__asm__ __volatile__ (#op " %" #regs ", %" #regd)
+
+#define sse_r2ri(op, regs, regd, imm) \
+	__asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
+			      : /* nothing */ \
+			      : "X" (imm) )
+
+/* Load data from mems to xmm0, operate on xmm0, and store data to memd */
+#define sse_m2m(op, mems, memd, xmmreg) \
+	__asm__ __volatile__ ("movups %0, %%xmm0\n\t" \
+			      #op " %1, %%xmm0\n\t" \
+			      "movups %%xmm0, %0" \
+			      : "=X" (memd) \
+			      : "X" (mems))
+
+#define sse_m2ri(op, mem, reg, subop) \
+	__asm__ __volatile__ (#op " %0, %%" #reg ", " #subop \
+			      : /* nothing */ \
+			      : "X" (mem))
+
+#define sse_m2mi(op, mems, memd, xmmreg, subop) \
+	__asm__ __volatile__ ("movups %0, %%xmm0\n\t" \
+			      #op " %1, %%xmm0, " #subop "\n\t" \
+			      "movups %%xmm0, %0" \
+			      : "=X" (memd) \
+			      : "X" (mems))
+#endif
+
+
+
+
+/* 1x128 MOVe Aligned four Packed Single-fp
+*/
+#define movaps_m2r(var, reg) sse_m2r(movaps, var, reg)
+#define movaps_r2m(reg, var) sse_r2m(movaps, reg, var)
+#define movaps_r2r(regs, regd) sse_r2r(movaps, regs, regd)
+#define movaps(vars, vard) \
+	__asm__ __volatile__ ("movaps %1, %%xmm0\n\t" \
+			      "movaps %%xmm0, %0" \
+			      : "=X" (vard) \
+			      : "X" (vars))
+
+
+/* 1x128 MOVe aligned Non-Temporal four Packed Single-fp
+*/
+#define movntps_r2m(xmmreg, var) sse_r2m(movntps, xmmreg, var)
+
+
+/* 1x64 MOVe Non-Temporal Quadword
+*/
+#define movntq_r2m(mmreg, var) sse_r2m(movntq, mmreg, var)
+
+
+/* 1x128 MOVe Unaligned four Packed Single-fp
+*/
+#define movups_m2r(var, reg) sse_m2r(movups, var, reg)
+#define movups_r2m(reg, var) sse_r2m(movups, reg, var)
+#define movups_r2r(regs, regd) sse_r2r(movups, regs, regd)
+#define movups(vars, vard) \
+	__asm__ __volatile__ ("movups %1, %%xmm0\n\t" \
+			      "movups %%xmm0, %0" \
+			      : "=X" (vard) \
+			      : "X" (vars))
+
+
+/* MOVe High to Low Packed Single-fp
+   high half of 4x32f (x) -> low half of 4x32f (y)
+*/
+#define movhlps_r2r(regs, regd) sse_r2r(movhlps, regs, regd)
+
+
+/* MOVe Low to High Packed Single-fp
+   low half of 4x32f (x) -> high half of 4x32f (y)
+*/
+#define movlhps_r2r(regs, regd) sse_r2r(movlhps, regs, regd)
+
+
+/* MOVe High Packed Single-fp
+   2x32f -> high half of 4x32f
+*/
+#define movhps_m2r(var, reg) sse_m2r(movhps, var, reg)
+#define movhps_r2m(reg, var) sse_r2m(movhps, reg, var)
+#define movhps(vars, vard) \
+	__asm__ __volatile__ ("movhps %1, %%xmm0\n\t" \
+			      "movhps %%xmm0, %0" \
+			      : "=X" (vard) \
+			      : "X" (vars))
+
+
+/* MOVe Low Packed Single-fp
+   2x32f -> low half of 4x32f
+*/
+#define movlps_m2r(var, reg) sse_m2r(movlps, var, reg)
+#define movlps_r2m(reg, var) sse_r2m(movlps, reg, var)
+#define movlps(vars, vard) \
+	__asm__ __volatile__ ("movlps %1, %%xmm0\n\t" \
+			      "movlps %%xmm0, %0" \
+			      : "=X" (vard) \
+			      : "X" (vars))
+
+
+/* MOVe Scalar Single-fp
+   lowest field of 4x32f (x) -> lowest field of 4x32f (y)
+*/
+#define movss_m2r(var, reg) sse_m2r(movss, var, reg)
+#define movss_r2m(reg, var) sse_r2m(movss, reg, var)
+#define movss_r2r(regs, regd) sse_r2r(movss, regs, regd)
+#define movss(vars, vard) \
+	__asm__ __volatile__ ("movss %1, %%xmm0\n\t" \
+			      "movss %%xmm0, %0" \
+			      : "=X" (vard) \
+			      : "X" (vars))
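+
+
+/* Usage sketch (an illustration added here, not part of the original
+   tvtime/DScaler header): summing two packed-float vectors with the
+   wrapper macros above.  The function name is hypothetical; it assumes
+   the caller has already verified SSE support at runtime.  The add is
+   spelled through the raw sse_m2r() helper because the addps_m2r macro
+   is only defined further down in this header.  Like the rest of the
+   header, it relies on xmm0 implicitly and declares no clobbers.
+*/
+static inline void
+sse_add4f_sketch (sse_t *a, sse_t *b, sse_t *sum)
+{
+	movups_m2r (*a, xmm0);		/* xmm0 = four floats loaded from a */
+	sse_m2r (addps, *b, xmm0);	/* xmm0 += b, four parallel float adds */
+	movups_r2m (xmm0, *sum);	/* store the packed sums back to memory */
+}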
+
+
+/* 4x16 Packed SHUFfle Word
+*/
+#define pshufw_m2r(var, reg, index) sse_m2ri(pshufw, var, reg, index)
+#define pshufw_r2r(regs, regd, index) sse_r2ri(pshufw, regs, regd, index)
+
+
+/* 1x128 SHUFfle Packed Single-fp
+*/
+#define shufps_m2r(var, reg, index) sse_m2ri(shufps, var, reg, index)
+#define shufps_r2r(regs, regd, index) sse_r2ri(shufps, regs, regd, index)
+
+
+/* ConVerT Packed signed Int32 to(2) Packed Single-fp
+*/
+#define cvtpi2ps_m2r(var, xmmreg) sse_m2r(cvtpi2ps, var, xmmreg)
+#define cvtpi2ps_r2r(mmreg, xmmreg) sse_r2r(cvtpi2ps, mmreg, xmmreg)
+
+
+/* ConVerT Packed Single-fp to(2) Packed signed Int32
+*/
+#define cvtps2pi_m2r(var, mmreg) sse_m2r(cvtps2pi, var, mmreg)
+#define cvtps2pi_r2r(xmmreg, mmreg) sse_r2r(cvtps2pi, xmmreg, mmreg)
+
+
+/* ConVerT with Truncate Packed Single-fp to(2) Packed Int32
+*/
+#define cvttps2pi_m2r(var, mmreg) sse_m2r(cvttps2pi, var, mmreg)
+#define cvttps2pi_r2r(xmmreg, mmreg) sse_r2r(cvttps2pi, xmmreg, mmreg)
+
+
+/* ConVerT Signed Int32 to(2) Single-fp (Scalar)
+*/
+#define cvtsi2ss_m2r(var, xmmreg) sse_m2r(cvtsi2ss, var, xmmreg)
+#define cvtsi2ss_r2r(reg, xmmreg) sse_r2r(cvtsi2ss, reg, xmmreg)
+
+
+/* ConVerT Scalar Single-fp to(2) Signed Int32
+*/
+#define cvtss2si_m2r(var, reg) sse_m2r(cvtss2si, var, reg)
+#define cvtss2si_r2r(xmmreg, reg) sse_r2r(cvtss2si, xmmreg, reg)
+
+
+/* ConVerT with Truncate Scalar Single-fp to(2) Signed Int32
+*/
+#define cvttss2si_m2r(var, reg) sse_m2r(cvttss2si, var, reg)
+#define cvttss2si_r2r(xmmreg, reg) sse_r2r(cvttss2si, xmmreg, reg)
+
+
+/* Parallel EXTRact Word from 4x16
+*/
+#define pextrw_r2r(mmreg, reg, field) sse_r2ri(pextrw, mmreg, reg, field)
+
+
+/* Parallel INSeRt Word from 4x16
+*/
+#define pinsrw_r2r(reg, mmreg, field) sse_r2ri(pinsrw, reg, mmreg, field)
+
+
+
+/* MOVe MaSK from Packed Single-fp
+*/
+#ifdef SSE_TRACE
+	#define movmskps(xmmreg, reg) \
+	{ \
+		fprintf(stderr, "movmskps()\n"); \
+		__asm__ __volatile__ ("movmskps %" #xmmreg ", %" #reg); \
+	}
+#else
+	#define movmskps(xmmreg, reg) \
+	__asm__ __volatile__ ("movmskps %" #xmmreg ", %" #reg)
+#endif
+
+
+/* Parallel MOVe MaSK from mmx reg to 32-bit reg
+*/
+#ifdef SSE_TRACE
+	#define pmovmskb(mmreg, reg) \
+	{ \
+		fprintf(stderr, "pmovmskb()\n"); \
+		__asm__ __volatile__ ("pmovmskb %" #mmreg ", %" #reg); \
+	}
+#else
+	#define pmovmskb(mmreg, reg) \
+	__asm__ __volatile__ ("pmovmskb %" #mmreg ", %" #reg)
+#endif
+
+
+/* MASKed MOVe from 8x8 to memory pointed to by (e)di register
+*/
+#define maskmovq(mmregs, fieldreg) sse_r2r(maskmovq, mmregs, fieldreg)
+
+
+
+
+/* 4x32f Parallel ADDs
+*/
+#define addps_m2r(var, reg) sse_m2r(addps, var, reg)
+#define addps_r2r(regs, regd) sse_r2r(addps, regs, regd)
+#define addps(vars, vard, xmmreg) sse_m2m(addps, vars, vard, xmmreg)
+
+
+/* Lowest Field of 4x32f Parallel ADDs
+*/
+#define addss_m2r(var, reg) sse_m2r(addss, var, reg)
+#define addss_r2r(regs, regd) sse_r2r(addss, regs, regd)
+#define addss(vars, vard, xmmreg) sse_m2m(addss, vars, vard, xmmreg)
+
+
+/* 4x32f Parallel SUBs
+*/
+#define subps_m2r(var, reg) sse_m2r(subps, var, reg)
+#define subps_r2r(regs, regd) sse_r2r(subps, regs, regd)
+#define subps(vars, vard, xmmreg) sse_m2m(subps, vars, vard, xmmreg)
+
+
+/* Lowest Field of 4x32f Parallel SUBs
+*/
+#define subss_m2r(var, reg) sse_m2r(subss, var, reg)
+#define subss_r2r(regs, regd) sse_r2r(subss, regs, regd)
+#define subss(vars, vard, xmmreg) sse_m2m(subss, vars, vard, xmmreg)
+
+
+/* 8x8u -> 4x16u Packed Sum of Absolute Differences
+*/
+#define psadbw_m2r(var, reg) sse_m2r(psadbw, var, reg)
+#define psadbw_r2r(regs, regd) sse_r2r(psadbw, regs, regd)
+#define psadbw(vars, vard, mmreg) sse_m2m(psadbw, vars, vard, mmreg) + + +/* 4x16u Parallel MUL High Unsigned +*/ +#define pmulhuw_m2r(var, reg) sse_m2r(pmulhuw, var, reg) +#define pmulhuw_r2r(regs, regd) sse_r2r(pmulhuw, regs, regd) +#define pmulhuw(vars, vard, mmreg) sse_m2m(pmulhuw, vars, vard, mmreg) + + +/* 4x32f Parallel MULs +*/ +#define mulps_m2r(var, reg) sse_m2r(mulps, var, reg) +#define mulps_r2r(regs, regd) sse_r2r(mulps, regs, regd) +#define mulps(vars, vard, xmmreg) sse_m2m(mulps, vars, vard, xmmreg) + + +/* Lowest Field of 4x32f Parallel MULs +*/ +#define mulss_m2r(var, reg) sse_m2r(mulss, var, reg) +#define mulss_r2r(regs, regd) sse_r2r(mulss, regs, regd) +#define mulss(vars, vard, xmmreg) sse_m2m(mulss, vars, vard, xmmreg) + + +/* 4x32f Parallel DIVs +*/ +#define divps_m2r(var, reg) sse_m2r(divps, var, reg) +#define divps_r2r(regs, regd) sse_r2r(divps, regs, regd) +#define divps(vars, vard, xmmreg) sse_m2m(divps, vars, vard, xmmreg) + + +/* Lowest Field of 4x32f Parallel DIVs +*/ +#define divss_m2r(var, reg) sse_m2r(divss, var, reg) +#define divss_r2r(regs, regd) sse_r2r(divss, regs, regd) +#define divss(vars, vard, xmmreg) sse_m2m(divss, vars, vard, xmmreg) + + +/* 4x32f Parallel Reciprocals +*/ +#define rcpps_m2r(var, reg) sse_m2r(rcpps, var, reg) +#define rcpps_r2r(regs, regd) sse_r2r(rcpps, regs, regd) +#define rcpps(vars, vard, xmmreg) sse_m2m(rcpps, vars, vard, xmmreg) + + +/* Lowest Field of 4x32f Parallel Reciprocals +*/ +#define rcpss_m2r(var, reg) sse_m2r(rcpss, var, reg) +#define rcpss_r2r(regs, regd) sse_r2r(rcpss, regs, regd) +#define rcpss(vars, vard, xmmreg) sse_m2m(rcpss, vars, vard, xmmreg) + + +/* 4x32f Parallel Square Root of Reciprocals +*/ +#define rsqrtps_m2r(var, reg) sse_m2r(rsqrtps, var, reg) +#define rsqrtps_r2r(regs, regd) sse_r2r(rsqrtps, regs, regd) +#define rsqrtps(vars, vard, xmmreg) sse_m2m(rsqrtps, vars, vard, xmmreg) + + +/* Lowest Field of 4x32f Parallel Square Root of Reciprocals +*/ +#define rsqrtss_m2r(var, reg) sse_m2r(rsqrtss, var, reg) +#define rsqrtss_r2r(regs, regd) sse_r2r(rsqrtss, regs, regd) +#define rsqrtss(vars, vard, xmmreg) sse_m2m(rsqrtss, vars, vard, xmmreg) + + +/* 4x32f Parallel Square Roots +*/ +#define sqrtps_m2r(var, reg) sse_m2r(sqrtps, var, reg) +#define sqrtps_r2r(regs, regd) sse_r2r(sqrtps, regs, regd) +#define sqrtps(vars, vard, xmmreg) sse_m2m(sqrtps, vars, vard, xmmreg) + + +/* Lowest Field of 4x32f Parallel Square Roots +*/ +#define sqrtss_m2r(var, reg) sse_m2r(sqrtss, var, reg) +#define sqrtss_r2r(regs, regd) sse_r2r(sqrtss, regs, regd) +#define sqrtss(vars, vard, xmmreg) sse_m2m(sqrtss, vars, vard, xmmreg) + + +/* 8x8u and 4x16u Parallel AVeraGe +*/ +#define pavgb_m2r(var, reg) sse_m2r(pavgb, var, reg) +#define pavgb_r2r(regs, regd) sse_r2r(pavgb, regs, regd) +#define pavgb(vars, vard, mmreg) sse_m2m(pavgb, vars, vard, mmreg) + +#define pavgw_m2r(var, reg) sse_m2r(pavgw, var, reg) +#define pavgw_r2r(regs, regd) sse_r2r(pavgw, regs, regd) +#define pavgw(vars, vard, mmreg) sse_m2m(pavgw, vars, vard, mmreg) + + +/* 1x128 bitwise AND +*/ +#define andps_m2r(var, reg) sse_m2r(andps, var, reg) +#define andps_r2r(regs, regd) sse_r2r(andps, regs, regd) +#define andps(vars, vard, xmmreg) sse_m2m(andps, vars, vard, xmmreg) + + +/* 1x128 bitwise AND with Not the destination +*/ +#define andnps_m2r(var, reg) sse_m2r(andnps, var, reg) +#define andnps_r2r(regs, regd) sse_r2r(andnps, regs, regd) +#define andnps(vars, vard, xmmreg) sse_m2m(andnps, vars, vard, xmmreg) + + +/* 1x128 bitwise OR +*/ +#define orps_m2r(var, reg) 
sse_m2r(orps, var, reg) +#define orps_r2r(regs, regd) sse_r2r(orps, regs, regd) +#define orps(vars, vard, xmmreg) sse_m2m(orps, vars, vard, xmmreg) + + +/* 1x128 bitwise eXclusive OR +*/ +#define xorps_m2r(var, reg) sse_m2r(xorps, var, reg) +#define xorps_r2r(regs, regd) sse_r2r(xorps, regs, regd) +#define xorps(vars, vard, xmmreg) sse_m2m(xorps, vars, vard, xmmreg) + + +/* 8x8u, 4x16, and 4x32f Parallel Maximum +*/ +#define pmaxub_m2r(var, reg) sse_m2r(pmaxub, var, reg) +#define pmaxub_r2r(regs, regd) sse_r2r(pmaxub, regs, regd) +#define pmaxub(vars, vard, mmreg) sse_m2m(pmaxub, vars, vard, mmreg) + +#define pmaxsw_m2r(var, reg) sse_m2r(pmaxsw, var, reg) +#define pmaxsw_r2r(regs, regd) sse_r2r(pmaxsw, regs, regd) +#define pmaxsw(vars, vard, mmreg) sse_m2m(pmaxsw, vars, vard, mmreg) + +#define maxps_m2r(var, reg) sse_m2r(maxps, var, reg) +#define maxps_r2r(regs, regd) sse_r2r(maxps, regs, regd) +#define maxps(vars, vard, xmmreg) sse_m2m(maxps, vars, vard, xmmreg) + + +/* Lowest Field of 4x32f Parallel Maximum +*/ +#define maxss_m2r(var, reg) sse_m2r(maxss, var, reg) +#define maxss_r2r(regs, regd) sse_r2r(maxss, regs, regd) +#define maxss(vars, vard, xmmreg) sse_m2m(maxss, vars, vard, xmmreg) + + +/* 8x8u, 4x16, and 4x32f Parallel Minimum +*/ +#define pminub_m2r(var, reg) sse_m2r(pminub, var, reg) +#define pminub_r2r(regs, regd) sse_r2r(pminub, regs, regd) +#define pminub(vars, vard, mmreg) sse_m2m(pminub, vars, vard, mmreg) + +#define pminsw_m2r(var, reg) sse_m2r(pminsw, var, reg) +#define pminsw_r2r(regs, regd) sse_r2r(pminsw, regs, regd) +#define pminsw(vars, vard, mmreg) sse_m2m(pminsw, vars, vard, mmreg) + +#define minps_m2r(var, reg) sse_m2r(minps, var, reg) +#define minps_r2r(regs, regd) sse_r2r(minps, regs, regd) +#define minps(vars, vard, xmmreg) sse_m2m(minps, vars, vard, xmmreg) + + +/* Lowest Field of 4x32f Parallel Minimum +*/ +#define minss_m2r(var, reg) sse_m2r(minss, var, reg) +#define minss_r2r(regs, regd) sse_r2r(minss, regs, regd) +#define minss(vars, vard, xmmreg) sse_m2m(minss, vars, vard, xmmreg) + + +/* 4x32f Parallel CoMPares + (resulting fields are either 0 or -1) +*/ +#define cmpps_m2r(var, reg, op) sse_m2ri(cmpps, var, reg, op) +#define cmpps_r2r(regs, regd, op) sse_r2ri(cmpps, regs, regd, op) +#define cmpps(vars, vard, op, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, op) + +#define cmpeqps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 0) +#define cmpeqps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 0) +#define cmpeqps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 0) + +#define cmpltps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 1) +#define cmpltps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 1) +#define cmpltps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 1) + +#define cmpleps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 2) +#define cmpleps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 2) +#define cmpleps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 2) + +#define cmpunordps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 3) +#define cmpunordps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 3) +#define cmpunordps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 3) + +#define cmpneqps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 4) +#define cmpneqps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 4) +#define cmpneqps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 4) + +#define cmpnltps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 5) +#define cmpnltps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 5) +#define cmpnltps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, 
xmmreg, 5) + +#define cmpnleps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 6) +#define cmpnleps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 6) +#define cmpnleps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 6) + +#define cmpordps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 7) +#define cmpordps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 7) +#define cmpordps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 7) + + +/* Lowest Field of 4x32f Parallel CoMPares + (resulting fields are either 0 or -1) +*/ +#define cmpss_m2r(var, reg, op) sse_m2ri(cmpss, var, reg, op) +#define cmpss_r2r(regs, regd, op) sse_r2ri(cmpss, regs, regd, op) +#define cmpss(vars, vard, op, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, op) + +#define cmpeqss_m2r(var, reg) sse_m2ri(cmpss, var, reg, 0) +#define cmpeqss_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 0) +#define cmpeqss(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 0) + +#define cmpltss_m2r(var, reg) sse_m2ri(cmpss, var, reg, 1) +#define cmpltss_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 1) +#define cmpltss(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 1) + +#define cmpless_m2r(var, reg) sse_m2ri(cmpss, var, reg, 2) +#define cmpless_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 2) +#define cmpless(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 2) + +#define cmpunordss_m2r(var, reg) sse_m2ri(cmpss, var, reg, 3) +#define cmpunordss_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 3) +#define cmpunordss(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 3) + +#define cmpneqss_m2r(var, reg) sse_m2ri(cmpss, var, reg, 4) +#define cmpneqss_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 4) +#define cmpneqss(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 4) + +#define cmpnltss_m2r(var, reg) sse_m2ri(cmpss, var, reg, 5) +#define cmpnltss_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 5) +#define cmpnltss(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 5) + +#define cmpnless_m2r(var, reg) sse_m2ri(cmpss, var, reg, 6) +#define cmpnless_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 6) +#define cmpnless(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 6) + +#define cmpordss_m2r(var, reg) sse_m2ri(cmpss, var, reg, 7) +#define cmpordss_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 7) +#define cmpordss(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 7) + + +/* Lowest Field of 4x32f Parallel CoMPares to set EFLAGS + (resulting fields are either 0 or -1) +*/ +#define comiss_m2r(var, reg) sse_m2r(comiss, var, reg) +#define comiss_r2r(regs, regd) sse_r2r(comiss, regs, regd) +#define comiss(vars, vard, xmmreg) sse_m2m(comiss, vars, vard, xmmreg) + + +/* Lowest Field of 4x32f Unordered Parallel CoMPares to set EFLAGS + (resulting fields are either 0 or -1) +*/ +#define ucomiss_m2r(var, reg) sse_m2r(ucomiss, var, reg) +#define ucomiss_r2r(regs, regd) sse_r2r(ucomiss, regs, regd) +#define ucomiss(vars, vard, xmmreg) sse_m2m(ucomiss, vars, vard, xmmreg) + + +/* 2-(4x32f) -> 4x32f UNPaCK Low Packed Single-fp + (interleaves low half of dest with low half of source + as padding in each result field) +*/ +#define unpcklps_m2r(var, reg) sse_m2r(unpcklps, var, reg) +#define unpcklps_r2r(regs, regd) sse_r2r(unpcklps, regs, regd) + + +/* 2-(4x32f) -> 4x32f UNPaCK High Packed Single-fp + (interleaves high half of dest with high half of source + as padding in each result field) +*/ +#define unpckhps_m2r(var, reg) sse_m2r(unpckhps, var, reg) +#define unpckhps_r2r(regs, regd) sse_r2r(unpckhps, regs, regd) + + + +/* Fp and mmX ReSTORe state +*/ 
+#ifdef SSE_TRACE
+	#define fxrstor(mem) \
+	{ \
+		fprintf(stderr, "fxrstor()\n"); \
+		__asm__ __volatile__ ("fxrstor %0" \
+				      : /* nothing */ \
+				      : "X" (mem)); \
+	}
+#else
+	#define fxrstor(mem) \
+	__asm__ __volatile__ ("fxrstor %0" \
+			      : /* nothing */ \
+			      : "X" (mem))
+#endif
+
+
+/* Fp and mmX SAVE state
+*/
+#ifdef SSE_TRACE
+	#define fxsave(mem) \
+	{ \
+		fprintf(stderr, "fxsave()\n"); \
+		__asm__ __volatile__ ("fxsave %0" \
+				      : /* nothing */ \
+				      : "X" (mem)); \
+	}
+#else
+	#define fxsave(mem) \
+	__asm__ __volatile__ ("fxsave %0" \
+			      : /* nothing */ \
+			      : "X" (mem))
+#endif
+
+
+/* STore streaMing simd eXtensions Control/Status Register
+*/
+#ifdef SSE_TRACE
+	#define stmxcsr(mem) \
+	{ \
+		fprintf(stderr, "stmxcsr()\n"); \
+		__asm__ __volatile__ ("stmxcsr %0" \
+				      : /* nothing */ \
+				      : "X" (mem)); \
+	}
+#else
+	#define stmxcsr(mem) \
+	__asm__ __volatile__ ("stmxcsr %0" \
+			      : /* nothing */ \
+			      : "X" (mem))
+#endif
+
+
+/* LoaD streaMing simd eXtensions Control/Status Register
+*/
+#ifdef SSE_TRACE
+	#define ldmxcsr(mem) \
+	{ \
+		fprintf(stderr, "ldmxcsr()\n"); \
+		__asm__ __volatile__ ("ldmxcsr %0" \
+				      : /* nothing */ \
+				      : "X" (mem)); \
+	}
+#else
+	#define ldmxcsr(mem) \
+	__asm__ __volatile__ ("ldmxcsr %0" \
+			      : /* nothing */ \
+			      : "X" (mem))
+#endif
+
+
+/* Store FENCE - enforce ordering of stores before fence vs. stores
+   occurring after fence in source code.
+*/
+#ifdef SSE_TRACE
+	#define sfence() \
+	{ \
+		fprintf(stderr, "sfence()\n"); \
+		__asm__ __volatile__ ("sfence\n\t"); \
+	}
+#else
+	#define sfence() \
+	__asm__ __volatile__ ("sfence\n\t")
+#endif
+
+
+/* PREFETCH data using T0, T1, T2, or NTA hint
+   T0  = Prefetch into all cache levels
+   T1  = Prefetch into all cache levels except 0th level
+   T2  = Prefetch into all cache levels except 0th and 1st levels
+   NTA = Prefetch data into non-temporal cache structure
+*/
+#ifdef SSE_TRACE
+#else
+	#define prefetch(mem, hint) \
+	__asm__ __volatile__ ("prefetch" #hint " %0" \
+			      : /* nothing */ \
+			      : "X" (mem))
+
+	#define prefetcht0(mem)  prefetch(mem, t0)
+	#define prefetcht1(mem)  prefetch(mem, t1)
+	#define prefetcht2(mem)  prefetch(mem, t2)
+	#define prefetchnta(mem) prefetch(mem, nta)
+#endif
+
+
+
+#endif diff --git a/gst/deinterlace/tvtime/tomsmocomp.c b/gst/deinterlace/tvtime/tomsmocomp.c new file mode 100644 index 00000000..3141fbac --- /dev/null +++ b/gst/deinterlace/tvtime/tomsmocomp.c @@ -0,0 +1,211 @@ +/**
+ * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net>
+ * Copyright (C) 2008 Sebastian Dröge <slomo@collabora.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include <stdlib.h> +#include "_stdint.h" +#include <string.h> + +#include "gst/gst.h" +#include "gstdeinterlace.h" +#include "plugins.h" + +#define GST_TYPE_DEINTERLACE_METHOD_TOMSMOCOMP (gst_deinterlace_method_tomsmocomp_get_type ()) +#define GST_IS_DEINTERLACE_METHOD_TOMSMOCOMP(obj) (G_TYPE_CHECK_INSTANCE_TYPE ((obj), GST_TYPE_DEINTERLACE_METHOD_TOMSMOCOMP)) +#define GST_IS_DEINTERLACE_METHOD_TOMSMOCOMP_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), GST_TYPE_DEINTERLACE_METHOD_TOMSMOCOMP)) +#define GST_DEINTERLACE_METHOD_TOMSMOCOMP_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS ((obj), GST_TYPE_DEINTERLACE_METHOD_TOMSMOCOMP, GstDeinterlaceMethodTomsMoCompClass)) +#define GST_DEINTERLACE_METHOD_TOMSMOCOMP(obj) (G_TYPE_CHECK_INSTANCE_CAST ((obj), GST_TYPE_DEINTERLACE_METHOD_TOMSMOCOMP, GstDeinterlaceMethodTomsMoComp)) +#define GST_DEINTERLACE_METHOD_TOMSMOCOMP_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST ((klass), GST_TYPE_DEINTERLACE_METHOD_TOMSMOCOMP, GstDeinterlaceMethodTomsMoCompClass)) +#define GST_DEINTERLACE_METHOD_TOMSMOCOMP_CAST(obj) ((GstDeinterlaceMethodTomsMoComp*)(obj)) + +GType gst_deinterlace_method_tomsmocomp_get_type (void); + +typedef struct +{ + GstDeinterlaceMethod parent; + + guint search_effort; + gboolean strange_bob; +} GstDeinterlaceMethodTomsMoComp; + +typedef struct +{ + GstDeinterlaceMethodClass parent_class; +} GstDeinterlaceMethodTomsMoCompClass; + +static int +Fieldcopy (void *dest, const void *src, size_t count, + int rows, int dst_pitch, int src_pitch) +{ + unsigned char *pDest = (unsigned char *) dest; + unsigned char *pSrc = (unsigned char *) src; + + int i; + + for (i = 0; i < rows; i++) { + oil_memcpy (pDest, pSrc, count); + pSrc += src_pitch; + pDest += dst_pitch; + } + return 0; +} + +#define USE_FOR_DSCALER + +#define IS_C +#define SIMD_TYPE C +#define FUNCT_NAME tomsmocompDScaler_C +#include "tomsmocomp/TomsMoCompAll.inc" +#undef IS_C +#undef SIMD_TYPE +#undef FUNCT_NAME + +#ifdef BUILD_X86_ASM + +#include "tomsmocomp/tomsmocompmacros.h" +#include "x86-64_macros.inc" + +#define IS_MMX +#define SIMD_TYPE MMX +#define FUNCT_NAME tomsmocompDScaler_MMX +#include "tomsmocomp/TomsMoCompAll.inc" +#undef IS_MMX +#undef SIMD_TYPE +#undef FUNCT_NAME + +#define IS_3DNOW +#define SIMD_TYPE 3DNOW +#define FUNCT_NAME tomsmocompDScaler_3DNOW +#include "tomsmocomp/TomsMoCompAll.inc" +#undef IS_3DNOW +#undef SIMD_TYPE +#undef FUNCT_NAME + +#define IS_MMXEXT +#define SIMD_TYPE MMXEXT +#define FUNCT_NAME tomsmocompDScaler_MMXEXT +#include "tomsmocomp/TomsMoCompAll.inc" +#undef IS_MMXEXT +#undef SIMD_TYPE +#undef FUNCT_NAME + +#endif + +G_DEFINE_TYPE (GstDeinterlaceMethodTomsMoComp, + gst_deinterlace_method_tomsmocomp, GST_TYPE_DEINTERLACE_METHOD); + +enum +{ + ARG_0, + ARG_SEARCH_EFFORT, + ARG_STRANGE_BOB +}; + +static void +gst_deinterlace_method_tomsmocomp_set_property (GObject * object, guint prop_id, + const GValue * value, GParamSpec * pspec) +{ + GstDeinterlaceMethodTomsMoComp *self = + GST_DEINTERLACE_METHOD_TOMSMOCOMP (object); + + switch (prop_id) { + case ARG_SEARCH_EFFORT: + self->search_effort = g_value_get_uint (value); + break; + case ARG_STRANGE_BOB: + self->strange_bob = g_value_get_boolean (value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec); + } +} + +static void +gst_deinterlace_method_tomsmocomp_get_property (GObject * object, guint prop_id, + GValue * value, GParamSpec * pspec) +{ + GstDeinterlaceMethodTomsMoComp *self = + GST_DEINTERLACE_METHOD_TOMSMOCOMP 
(object); + + switch (prop_id) { + case ARG_SEARCH_EFFORT: + g_value_set_uint (value, self->search_effort); + break; + case ARG_STRANGE_BOB: + g_value_set_boolean (value, self->strange_bob); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec); + } +} + +static void + gst_deinterlace_method_tomsmocomp_class_init + (GstDeinterlaceMethodTomsMoCompClass * klass) +{ + GstDeinterlaceMethodClass *dim_class = (GstDeinterlaceMethodClass *) klass; + GObjectClass *gobject_class = (GObjectClass *) klass; +#ifdef BUILD_X86_ASM + guint cpu_flags = oil_cpu_get_flags (); +#endif + + gobject_class->set_property = gst_deinterlace_method_tomsmocomp_set_property; + gobject_class->get_property = gst_deinterlace_method_tomsmocomp_get_property; + + g_object_class_install_property (gobject_class, ARG_SEARCH_EFFORT, + g_param_spec_uint ("search-effort", + "Search Effort", + "Search Effort", 0, 27, 5, G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS) + ); + + g_object_class_install_property (gobject_class, ARG_STRANGE_BOB, + g_param_spec_boolean ("strange-bob", + "Strange Bob", + "Use strange bob", FALSE, G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS) + ); + + dim_class->fields_required = 4; + dim_class->name = "Motion Adaptive: Motion Search"; + dim_class->nick = "tomsmocomp"; + dim_class->latency = 1; + +#ifdef BUILD_X86_ASM + if (cpu_flags & OIL_IMPL_FLAG_MMXEXT) { + dim_class->deinterlace_frame = tomsmocompDScaler_MMXEXT; + } else if (cpu_flags & OIL_IMPL_FLAG_3DNOW) { + dim_class->deinterlace_frame = tomsmocompDScaler_3DNOW; + } else if (cpu_flags & OIL_IMPL_FLAG_MMX) { + dim_class->deinterlace_frame = tomsmocompDScaler_MMX; + } else { + dim_class->deinterlace_frame = tomsmocompDScaler_C; + } +#else + dim_class->deinterlace_frame = tomsmocompDScaler_C; +#endif +} + +static void +gst_deinterlace_method_tomsmocomp_init (GstDeinterlaceMethodTomsMoComp * self) +{ + self->search_effort = 5; + self->strange_bob = FALSE; +} diff --git a/gst/deinterlace/tvtime/tomsmocomp/SearchLoop0A.inc b/gst/deinterlace/tvtime/tomsmocomp/SearchLoop0A.inc new file mode 100644 index 00000000..b1d9aeca --- /dev/null +++ b/gst/deinterlace/tvtime/tomsmocomp/SearchLoop0A.inc @@ -0,0 +1,15 @@ +// -*- c++ -*- + +// Searches just the center pixel, in both the old +// and new fields, but takes averages. This is an even +// pixel address. Any chroma match will be used. 
(YUY2)
+// We like finding 0 motion best, so we bias everything we found previously
+// up by a little, and adjust later
+
+#ifdef IS_SSE2
+ "paddusb "_ONES", %%xmm7\n\t" // bias toward no motion
+#else
+ "paddusb "_ONES", %%mm7\n\t" // bias toward no motion
+#endif
+
+ MERGE4PIXavg("(%%"XDI", %%"XCX")", "(%%"XSI", %%"XCX")") // center, in old and new
diff --git a/gst/deinterlace/tvtime/tomsmocomp/SearchLoopBottom.inc b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopBottom.inc new file mode 100644 index 00000000..e1560353 --- /dev/null +++ b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopBottom.inc @@ -0,0 +1,174 @@
+// -*- c++ -*-
+
+// Version for non-SSE2
+
+#ifndef IS_C
+
+#ifdef SKIP_SEARCH
+ "movq %%mm6, %%mm0\n\t" // just use the results of our weird bob
+#else
+
+
+ // JA 9/Dec/2002
+ // failed experiment
+ // but leave in placeholder for me to play about
+#ifdef DONT_USE_STRANGE_BOB
+ // Use the best weave if diffs less than 10 as that
+ // means the image is still or moving cleanly
+ // if there is motion we will clip which will catch anything
+ "psubusb "_FOURS", %%mm7\n\t" // sets bits to zero if weave diff < 4
+ "pxor %%mm0, %%mm0\n\t"
+ "pcmpeqb %%mm0, %%mm7\n\t" // all ff where weave better, else 00
+ "pcmpeqb %%mm7, %%mm0\n\t" // all ff where bob better, else 00
+ "pand %%mm6, %%mm0\n\t" // use bob for these pixel values
+ "pand %%mm5, %%mm7\n\t" // use weave for these
+ "por %%mm7, %%mm0\n\t" // combine both
+#else
+ // Use the better of bob or weave
+ // pminub mm4, TENS // the most we care about
+ V_PMINUB ("%%mm4", _TENS, "%%mm0") // the most we care about
+
+ "psubusb %%mm4, %%mm7\n\t" // forgive that much from weave est?
+ "psubusb "_FOURS", %%mm7\n\t" // bias it a bit toward weave
+ "pxor %%mm0, %%mm0\n\t"
+ "pcmpeqb %%mm0, %%mm7\n\t" // all ff where weave better, else 00
+ "pcmpeqb %%mm7, %%mm0\n\t" // all ff where bob better, else 00
+ "pand %%mm6, %%mm0\n\t" // use bob for these pixel values
+ "pand %%mm5, %%mm7\n\t" // use weave for these
+ "por %%mm7, %%mm0\n\t" // combine both
+#endif
+
+
+ // pminub mm0, Max_Vals // but clip to catch the stray error
+ V_PMINUB ("%%mm0", _Max_Vals, "%%mm1") // but clip to catch the stray error
+ // pmaxub mm0, Min_Vals
+ V_PMAXUB ("%%mm0", _Min_Vals)
+
+#endif
+
+
+ MOVX" "_pDest", %%"XAX"\n\t"
+
+#ifdef USE_VERTICAL_FILTER
+ "movq %%mm0, %%mm1\n\t"
+ // pavgb mm0, qword ptr["XBX"]
+ V_PAVGB ("%%mm0", "(%%"XBX")", "%%mm2", _ShiftMask)
+ // movntq qword ptr["XAX"+"XDX"], mm0
+ V_MOVNTQ ("(%%"XAX", %%"XDX")", "%%mm0")
+ // pavgb mm1, qword ptr["XBX"+"XCX"]
+ V_PAVGB ("%%mm1", "(%%"XBX", %%"XCX")", "%%mm2", _ShiftMask)
+ //FIXME: XDX or XAX!!
+ "addq "_dst_pitchw", %%"XBX"\n\t"
+ // movntq qword ptr["XAX"+"XDX"], mm1
+ V_MOVNTQ ("(%%"XAX", %%"XDX")", "%%mm1")
+#else
+
+ // movntq qword ptr["XAX"+"XDX"], mm0
+ V_MOVNTQ ("(%%"XAX", %%"XDX")", "%%mm0")
+#endif
+
+ LEAX" 8(%%"XDX"), %%"XDX"\n\t" // bump offset pointer
+ CMPX" "_Last8", %%"XDX"\n\t" // done with line?
+ "jb 1b\n\t" // y + + MOVX" "_oldbx", %%"XBX"\n\t" + + : /* no outputs */ + + : "m"(pBob), + "m"(src_pitch2), + "m"(ShiftMask), + "m"(pDest), + "m"(dst_pitchw), + "m"(Last8), + "m"(pSrc), + "m"(pSrcP), + "m"(pBobP), + "m"(DiffThres), + "m"(Min_Vals), + "m"(Max_Vals), + "m"(FOURS), + "m"(TENS), + "m"(ONES), + "m"(UVMask), + "m"(Max_Mov), + "m"(YMask), + "m"(oldbx) + + : XAX, XCX, XDX, XSI, XDI, + "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)", +#ifdef __MMX__ + "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", +#endif + "memory", "cc" + ); + + // adjust for next line + pSrc += src_pitch2; + pSrcP += src_pitch2; + pDest += dst_pitch2; + pBob += src_pitch2; + pBobP += src_pitch2; + } + + return 0; +#else +#ifdef SKIP_SEARCH + out[0] = best[0]; // just use the results of our wierd bob + out[1] = best[1]; +#else + diff[0] = diff[0] - MIN (diff[0], 10) - 4; + diff[1] = diff[1] - MIN (diff[1] - 10) - 4; + if (diff[0] < 0) + out[0] = weave[0]; + else + out[0] = best[0]; + + if (diff[1] < 0) + out[1] = weave[1]; + else + out[1] = best[1]; + + + out[0] = CLAMP (out[0], MinVals[0], MaxVals[0]); + out[1] = CLAMP (out[1], MinVals[1], MaxVals[1]); +#endif + +#ifdef USE_VERTICAL_FILTER + pDest[x] = (out[0] + pBob[0]) / 2; + pDest[x + dst_pitchw] = (pBob[src_pitch2] + out[0]) / 2; + pDest[x + 1] = (out[1] + pBob[1]) / 2; + pDest[x + 1 + dst_pitchw] = (pBob[src_pitch2 + 1] + out[1]) / 2; +#else + pDest[x] = out[0]; + pDest[x+1] = out[1]; +#endif + pBob += 2; + pBobP += 2; + pSrc += 2; + pSrcP += 2; + } + // adjust for next line + pSrc = src_pitch2 * (y+1) + pWeaveSrc; + pSrcP = src_pitch2 * (y+1) + pWeaveSrcP; + pDest = dst_pitch2 * (y+1) + pWeaveDest + dst_pitch2; + + + if (TopFirst) + { + pBob = pCopySrc + src_pitch2; + pBobP = pCopySrcP + src_pitch2; + } + else + { + pBob = pCopySrc; + pBobP = pCopySrcP; + } + + pBob += src_pitch2 * (y+1); + pBobP += src_pitch2 * (y+1); + } + + return 0; + +#endif diff --git a/gst/deinterlace/tvtime/tomsmocomp/SearchLoopEdgeA.inc b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopEdgeA.inc new file mode 100644 index 00000000..6208fe8c --- /dev/null +++ b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopEdgeA.inc @@ -0,0 +1,11 @@ +// -*- c++ -*- + +// Searches 2 pixel to the left and right, in both the old +// and new fields, but takes averages. These are even +// pixel addresses. Chroma match will be used. (YUY2) + MERGE4PIXavg("-4(%%"XDI")", "4(%%"XSI", %%"XCX", 2)") // up left, down right + MERGE4PIXavg("4(%%"XDI")", "-4(%%"XSI", %%"XCX", 2)") // up right, down left + MERGE4PIXavg("-4(%%"XDI", %%"XCX")", "4(%%"XSI", %%"XCX")") // left, right + MERGE4PIXavg("4(%%"XDI", %%"XCX")", "-4(%%"XSI", %%"XCX")") // right, left + MERGE4PIXavg("-4(%%"XDI", %%"XCX", 2)", "4(%%"XSI")") // down left, up right + MERGE4PIXavg("4(%%"XDI", %%"XCX", 2)", "-4(%%"XSI")") // down right, up left diff --git a/gst/deinterlace/tvtime/tomsmocomp/SearchLoopEdgeA8.inc b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopEdgeA8.inc new file mode 100644 index 00000000..2841c3f6 --- /dev/null +++ b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopEdgeA8.inc @@ -0,0 +1,12 @@ +// -*- c++ -*- + +// Searches 4 pixel to the left and right, in both the old +// and new fields, but takes averages. These are even +// pixel addresses. Chroma match will be used. 
(YUY2) + MERGE4PIXavg("-8(%%"XDI")", "8(%%"XSI", %%"XCX", 2)") // up left, down right + MERGE4PIXavg("8(%%"XDI")", "-8(%%"XSI", %%"XCX", 2)") // up right, down left + MERGE4PIXavg("-8(%%"XDI", %%"XCX")", "8(%%"XSI", %%"XCX")") // left, right + MERGE4PIXavg("8(%%"XDI", %%"XCX")", "-8(%%"XSI", %%"XCX")") // right, left + MERGE4PIXavg("-8(%%"XDI", %%"XCX", 2)", "8(%%"XSI")") // down left, up right + MERGE4PIXavg("8(%%"XDI", %%"XCX", 2)", "-8(%%"XSI")") // down right, up left + diff --git a/gst/deinterlace/tvtime/tomsmocomp/SearchLoopOddA.inc b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopOddA.inc new file mode 100644 index 00000000..ab5375f4 --- /dev/null +++ b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopOddA.inc @@ -0,0 +1,10 @@ +// -*- c++ -*- + +// Searches 1 pixel to the left and right, in both the old +// and new fields, but takes averages. These are odd +// pixel addresses. Any chroma match will not be used. (YUY2) + MERGE4PIXavg("-2(%%"XDI")", "2(%%"XSI", %%"XCX", 2)") // up left, down right + MERGE4PIXavg("2(%%"XDI")", "-2(%%"XSI", %%"XCX", 2)") // up right, down left + MERGE4PIXavg("-2(%%"XDI", %%"XCX", 2)", "2(%%"XSI")") // down left, up right + MERGE4PIXavg("2(%%"XDI", %%"XCX", 2)", "-2(%%"XSI")") // down right, up left +#include "SearchLoopOddA2.inc" diff --git a/gst/deinterlace/tvtime/tomsmocomp/SearchLoopOddA2.inc b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopOddA2.inc new file mode 100644 index 00000000..fd3f6fb0 --- /dev/null +++ b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopOddA2.inc @@ -0,0 +1,5 @@ +// Searches 1 pixel to the left and right, in both the old +// and new fields, but takes averages. These are odd +// pixel addresses. Any chroma match will not be used. (YUY2) + MERGE4PIXavg("-2(%%"XDI", %%"XCX")", "2(%%"XSI", %%"XCX")") // left, right + MERGE4PIXavg("2(%%"XDI", %%"XCX")", "-2(%%"XSI", %%"XCX")") // right, left diff --git a/gst/deinterlace/tvtime/tomsmocomp/SearchLoopOddA6.inc b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopOddA6.inc new file mode 100644 index 00000000..cbae014e --- /dev/null +++ b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopOddA6.inc @@ -0,0 +1,11 @@ +// -*- c++ -*- + +// Searches 3 pixels to the left and right, in both the old +// and new fields, but takes averages. These are odd +// pixel addresses. Any chroma match will not be used. (YUY2) + MERGE4PIXavg("-6(%%"XDI")", "6(%%"XSI", %%"XCX", 2)") // up left, down right + MERGE4PIXavg("6(%%"XDI")", "-6(%%"XSI", %%"XCX", 2)") // up right, down left + MERGE4PIXavg("-6(%%"XDI", %%"XCX")", "6(%%"XSI", %%"XCX")") // left, right + MERGE4PIXavg("6(%%"XDI", %%"XCX")", "-6(%%"XSI", %%"XCX")") // right, left + MERGE4PIXavg("-6(%%"XDI", %%"XCX", 2)", "6(%%"XSI")") // down left, up right + MERGE4PIXavg("6(%%"XDI", %%"XCX", 2)", "-6(%%"XSI")") // down right, up left diff --git a/gst/deinterlace/tvtime/tomsmocomp/SearchLoopOddAH.inc b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopOddAH.inc new file mode 100644 index 00000000..e59e3c7e --- /dev/null +++ b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopOddAH.inc @@ -0,0 +1,10 @@ +// Searches 1 pixel to the left and right, in both the old
+// and new fields, but takes v-half pel averages. These are odd
+// pixel addresses. Any chroma match will not be used. (YUY2)
+ MERGE4PIXavgH("-2(%%"XDI")", "-2(%%"XDI", %%"XCX")", "2(%%"XSI", %%"XCX")", "2(%%"XSI", %%"XCX", 2)") // up left, down right
+ MERGE4PIXavgH("2(%%"XDI")", "2(%%"XDI", %%"XCX")", "-2(%%"XSI", %%"XCX")", "-2(%%"XSI", %%"XCX", 2)") // up right, down left
+ MERGE4PIXavgH("-2(%%"XDI", %%"XCX", 2)", "-2(%%"XDI", %%"XCX")", "2(%%"XSI", %%"XCX")", "2(%%"XSI")") // down left, up right
+ MERGE4PIXavgH("2(%%"XDI", %%"XCX", 2)", "2(%%"XDI", %%"XCX")", "-2(%%"XSI", %%"XCX")", "-2(%%"XSI")") // down right, up left
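+
+// A scalar sketch of what each MERGE4PIXavgH candidate above computes
+// (an illustrative helper, not part of the original sources): the pair of
+// addresses from the previous field and the pair from the next field are
+// each averaged into a half-pel value, the two half-pel values are averaged
+// into a candidate, and the candidate is kept if the two fields disagree
+// less than the best match found so far.
+#if 0
+static inline void
+merge4pix_avg_h_scalar (int old_a, int old_b, int new_a, int new_b,
+    int *best, int *best_diff)
+{
+  int old_half = (old_a + old_b) / 2;   /* half-pel value in the old field */
+  int new_half = (new_a + new_b) / 2;   /* half-pel value in the new field */
+  int diff = ABS (old_half - new_half); /* how well the two fields agree */
+
+  if (diff < *best_diff) {
+    *best = (old_half + new_half) / 2;  /* candidate output pixel */
+    *best_diff = diff;
+  }
+}
+#endif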
diff --git a/gst/deinterlace/tvtime/tomsmocomp/SearchLoopOddAH2.inc b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopOddAH2.inc new file mode 100644 index 00000000..cd7d812a --- /dev/null +++ b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopOddAH2.inc @@ -0,0 +1,5 @@ +// Searches 1 pixel to the left and right, in both the old +// and new fields, but takes vertical averages. These are odd +// pixel addresses. Any chroma match will not be used. (YUY2) + MERGE4PIXavgH("-2(%%"XDI", %%"XCX")", "(%%"XDI", %%"XCX")", "(%%"XSI", %%"XCX")", "2(%%"XSI", %%"XCX")") // left, right + MERGE4PIXavgH("2(%%"XDI", %%"XCX")", "(%%"XDI", %%"XCX")", "(%%"XSI", %%"XCX")", "-2(%%"XSI", %%"XCX")") // right, left diff --git a/gst/deinterlace/tvtime/tomsmocomp/SearchLoopTop.inc b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopTop.inc new file mode 100644 index 00000000..9d6a490f --- /dev/null +++ b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopTop.inc @@ -0,0 +1,254 @@ +// -*- c++ -*- + +unsigned char* pDest; +const unsigned char* pSrcP; +const unsigned char* pSrc; +const unsigned char* pBob; +const unsigned char* pBobP; + +// long is int32 on ARCH_368, int64 on ARCH_AMD64. Declaring it this way +// saves a lot of xor's to delete 64bit garbage. + +#if defined(DBL_RESIZE) || defined(USE_FOR_DSCALER) +long src_pitch2 = src_pitch; // even & odd lines are not interleaved in DScaler +#else +long src_pitch2 = 2 * src_pitch; // even & odd lines are interleaved in Avisynth +#endif + + +long dst_pitch2 = 2 * dst_pitch; +long y; + +long Last8; + + pSrc = pWeaveSrc; // points 1 weave line above + pSrcP = pWeaveSrcP; // " + +#ifdef DBL_RESIZE + +#ifdef USE_VERTICAL_FILTER + pDest = pWeaveDest + dst_pitch2; +#else + pDest = pWeaveDest + 3*dst_pitch; +#endif + +#else + +#ifdef USE_VERTICAL_FILTER + pDest = pWeaveDest + dst_pitch; +#else + pDest = pWeaveDest + dst_pitch2; +#endif + +#endif + + if (TopFirst) + { + pBob = pCopySrc + src_pitch2; // remember one weave line just copied previously + pBobP = pCopySrcP + src_pitch2; + } + else + { + pBob = pCopySrc; + pBobP = pCopySrcP; + } + +#ifndef IS_C + +#ifndef _pBob +#define _pBob "%0" +#define _src_pitch2 "%1" +#define _ShiftMask "%2" +#define _pDest "%3" +#define _dst_pitchw "%4" +#define _Last8 "%5" +#define _pSrc "%6" +#define _pSrcP "%7" +#define _pBobP "%8" +#define _DiffThres "%9" +#define _Min_Vals "%10" +#define _Max_Vals "%11" +#define _FOURS "%12" +#define _TENS "%13" +#define _ONES "%14" +#define _UVMask "%15" +#define _Max_Mov "%16" +#define _YMask "%17" +#define _oldbx "%18" +#endif + Last8 = (rowsize-8); + + for (y=1; y < FldHeight-1; y++) + { + long dst_pitchw = dst_pitch; // local stor so asm can ref + int64_t Max_Mov = 0x0404040404040404ull; + int64_t DiffThres = 0x0f0f0f0f0f0f0f0full; + int64_t YMask = 0x00ff00ff00ff00ffull; // keeps only luma + int64_t UVMask = 0xff00ff00ff00ff00ull; // keeps only chroma + int64_t TENS = 0x0a0a0a0a0a0a0a0aull; + int64_t FOURS = 0x0404040404040404ull; + int64_t ONES = 0x0101010101010101ull; + int64_t Min_Vals = 0x0000000000000000ull; + int64_t Max_Vals = 0x0000000000000000ull; + int64_t ShiftMask = 0xfefffefffefffeffull; + + long oldbx; + + // pretend it's indented -->> + __asm__ __volatile__ + ( + // Loop general reg usage + // + // XAX - pBobP, then pDest + // XBX - pBob + // XCX - src_pitch2 + // XDX - current offset + // XDI - prev weave pixels, 1 line up + // XSI - next weave pixels, 1 line up + + // Save "XBX" (-fPIC) + MOVX" %%"XBX", "_oldbx"\n\t" + + // simple bob first 8 bytes + MOVX" "_pBob", %%"XBX"\n\t" + MOVX" "_src_pitch2", 
%%"XCX"\n\t" + +#ifdef USE_VERTICAL_FILTER + "movq (%%"XBX"), %%mm0\n\t" + "movq (%%"XBX", %%"XCX"), %%mm1\n\t" //, qword ptr["XBX"+"XCX"] + "movq %%mm0, %%mm2\n\t" + V_PAVGB ("%%mm2", "%%mm1", "%%mm3", _ShiftMask) // halfway between + V_PAVGB ("%%mm0", "%%mm2", "%%mm3", _ShiftMask) // 1/4 way + V_PAVGB ("%%mm1", "%%mm2", "%%mm3", _ShiftMask) // 3/4 way + MOVX" "_pDest", %%"XDI"\n\t" + MOVX" "_dst_pitchw", %%"XAX"\n\t" + V_MOVNTQ ("(%%"XDI")", "%%mm0") + V_MOVNTQ ("(%%"XDI", %%"XAX")", "%%mm1") // qword ptr["XDI"+"XAX"], mm1 + + // simple bob last 8 bytes + MOVX" "_Last8", %%"XDX"\n\t" + LEAX" (%%"XBX", %%"XDX"), %%"XSI"\n\t" // ["XBX"+"XDX"] + "movq (%%"XSI"), %%mm0\n\t" + "movq (%%"XSI", %%"XCX"), %%mm1\n\t" // qword ptr["XSI"+"XCX"] + "movq %%mm0, %%mm2\n\t" + V_PAVGB ("%%mm2", "%%mm1", "%%mm3", _ShiftMask) // halfway between + V_PAVGB ("%%mm0", "%%mm2", "%%mm3", _ShiftMask) // 1/4 way + V_PAVGB ("%%mm1", "%%mm2", "%%mm3", _ShiftMask) // 3/4 way + ADDX" %%"XDX", %%"XDI"\n\t" // last 8 bytes of dest + V_MOVNTQ ("%%"XDI"", "%%mm0") + V_MOVNTQ ("(%%"XDI", %%"XAX")", "%%mm1") // qword ptr["XDI"+"XAX"], mm1) + +#else + "movq (%%"XBX"), %%mm0\n\t" + // pavgb mm0, qword ptr["XBX"+"XCX"] + V_PAVGB ("%%mm0", "(%%"XBX", %%"XCX")", "%%mm2", _ShiftMask) // qword ptr["XBX"+"XCX"], mm2, ShiftMask) + MOVX" "_pDest", %%"XDI"\n\t" + V_MOVNTQ ("(%%"XDI")", "%%mm0") + + // simple bob last 8 bytes + MOVX" "_Last8", %%"XDX"\n\t" + LEAX" (%%"XBX", %%"XDX"), %%"XSI"\n\t" //"XSI", ["XBX"+"XDX"] + "movq (%%"XSI"), %%mm0\n\t" + // pavgb mm0, qword ptr["XSI"+"XCX"] + V_PAVGB ("%%mm0", "(%%"XSI", %%"XCX")", "%%mm2", _ShiftMask) // qword ptr["XSI"+"XCX"], mm2, ShiftMask) + V_MOVNTQ ("(%%"XDI", %%"XDX")", "%%mm0") // qword ptr["XDI"+"XDX"], mm0) +#endif + // now loop and get the middle qwords + MOVX" "_pSrc", %%"XSI"\n\t" + MOVX" "_pSrcP", %%"XDI"\n\t" + MOVX" $8, %%"XDX"\n\t" // curr offset longo all lines + + "1:\n\t" + MOVX" "_pBobP", %%"XAX"\n\t" + ADDX" $8, %%"XDI"\n\t" + ADDX" $8, %%"XSI"\n\t" + ADDX" $8, %%"XBX"\n\t" + ADDX" %%"XDX", %%"XAX"\n\t" + +#ifdef USE_STRANGE_BOB +#include "StrangeBob.inc" +#else +#include "WierdBob.inc" +#endif + + // For non-SSE2: + // through out most of the rest of this loop we will maintain + // mm4 our min bob value + // mm5 best weave pixels so far + // mm6 our max Bob value + // mm7 best weighted pixel ratings so far + + // We will keep a slight bias to using the weave pixels + // from the current location, by rating them by the min distance + // from the Bob value instead of the avg distance from that value. 
+ // our best and only rating so far
+ "pcmpeqb %%mm7, %%mm7\n\t" // ffff, say we didn't find anything good yet
+
+#else
+ Last8 = (rowsize - 4);
+
+ for (y=1; y < FldHeight-1; y++)
+ {
+ #ifdef USE_STRANGE_BOB
+ long DiffThres = 0x0f;
+ #endif
+
+ #ifndef SKIP_SEARCH
+ long weave[2], MaxVals[2], MinVals[2], mov[2];
+ #endif
+
+ long diff[2], best[2], avg[2], diff2[2], out[2], x;
+
+#ifdef USE_VERTICAL_FILTER
+ pDest[0] = (3 * pBob[0] + pBob[src_pitch2]) / 4;
+ pDest[1] = (3 * pBob[1] + pBob[src_pitch2 + 1]) / 4;
+ pDest[2] = (3 * pBob[2] + pBob[src_pitch2 + 2]) / 4;
+ pDest[3] = (3 * pBob[3] + pBob[src_pitch2 + 3]) / 4;
+ pDest[dst_pitchw] = (pBob[0] + 3 * pBob[src_pitch2]) / 4;
+ pDest[dst_pitchw + 1] = (pBob[1] + 3 * pBob[src_pitch2 + 1]) / 4;
+ pDest[dst_pitchw + 2] = (pBob[2] + 3 * pBob[src_pitch2 + 2]) / 4;
+ pDest[dst_pitchw + 3] = (pBob[3] + 3 * pBob[src_pitch2 + 3]) / 4;
+
+ // simple bob last 4 bytes
+ pDest[Last8] = (3 * pBob[Last8] + pBob[Last8 + src_pitch2]) / 4;
+ pDest[Last8 + 1] = (3 * pBob[Last8 + 1] + pBob[Last8 + src_pitch2 + 1]) / 4;
+ pDest[Last8 + 2] = (3 * pBob[Last8 + 2] + pBob[Last8 + src_pitch2 + 2]) / 4;
+ pDest[Last8 + 3] = (3 * pBob[Last8 + 3] + pBob[Last8 + src_pitch2 + 3]) / 4;
+ pDest[Last8 + src_pitch2] = (pBob[Last8] + 3 * pBob[Last8 + src_pitch2]) / 4;
+ pDest[Last8 + src_pitch2 + 1] = (pBob[Last8 + 1] + 3 * pBob[Last8 + src_pitch2 + 1]) / 4;
+ pDest[Last8 + src_pitch2 + 2] = (pBob[Last8 + 2] + 3 * pBob[Last8 + src_pitch2 + 2]) / 4;
+ pDest[Last8 + src_pitch2 + 3] = (pBob[Last8 + 3] + 3 * pBob[Last8 + src_pitch2 + 3]) / 4;
+#else
+ pDest[0] = (pBob[0] + pBob[src_pitch2]) / 2;
+ pDest[1] = (pBob[1] + pBob[src_pitch2 + 1]) / 2;
+ pDest[2] = (pBob[2] + pBob[src_pitch2 + 2]) / 2;
+ pDest[3] = (pBob[3] + pBob[src_pitch2 + 3]) / 2;
+
+ // simple bob last 4 bytes
+ pDest[Last8] = (pBob[Last8] + pBob[Last8 + src_pitch2]) / 2;
+ pDest[Last8 + 1] = (pBob[Last8 + 1] + pBob[Last8 + src_pitch2 + 1]) / 2;
+ pDest[Last8 + 2] = (pBob[Last8 + 2] + pBob[Last8 + src_pitch2 + 2]) / 2;
+ pDest[Last8 + 3] = (pBob[Last8 + 3] + pBob[Last8 + src_pitch2 + 3]) / 2;
+#endif
+
+ pBob += 4;
+ pBobP += 4;
+ pSrc += 4;
+ pSrcP += 4;
+
+ for (x=4; x < Last8; x += 2) {
+
+#ifdef USE_STRANGE_BOB
+#include "StrangeBob.inc"
+#else
+#include "WierdBob.inc"
+#endif
+
+ // We will keep a slight bias to using the weave pixels
+ // from the current location, by rating them by the min distance
+ // from the Bob value instead of the avg distance from that value.
+ // our best and only rating so far
+ diff[0] = diff[1] = 255;
+
+
+#endif
diff --git a/gst/deinterlace/tvtime/tomsmocomp/SearchLoopVA.inc b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopVA.inc new file mode 100644 index 00000000..3e3d19b5 --- /dev/null +++ b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopVA.inc @@ -0,0 +1,6 @@
+// -*- c++ -*-
+
+// Searches the center vertical line above center and below, in both the old
+// and new fields, but takes averages. These are even pixel addresses.
+ MERGE4PIXavg("(%%"XDI", %%"XCX", 2)", "(%%"XSI")") // down, up
+ MERGE4PIXavg("(%%"XDI")", "(%%"XSI", %%"XCX", 2)") // up, down
diff --git a/gst/deinterlace/tvtime/tomsmocomp/SearchLoopVAH.inc b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopVAH.inc new file mode 100644 index 00000000..33155bc1 --- /dev/null +++ b/gst/deinterlace/tvtime/tomsmocomp/SearchLoopVAH.inc @@ -0,0 +1,6 @@
+// -*- c++ -*-
+
+// Searches the center vertical line above center and below, in both the old
+// and new fields, but takes v-half pel averages. These are even pixel addresses.
+ MERGE4PIXavgH("(%%"XDI", %%"XCX", 2)", "(%%"XDI", %%"XCX")", "(%%"XSI", %%"XCX")", "(%%"XSI")") // down, up + MERGE4PIXavgH("(%%"XDI")", "(%%"XDI", %%"XCX")", "(%%"XSI", %%"XCX")", "(%%"XSI", %%"XCX", 2)") // up, down diff --git a/gst/deinterlace/tvtime/tomsmocomp/StrangeBob.inc b/gst/deinterlace/tvtime/tomsmocomp/StrangeBob.inc new file mode 100644 index 00000000..45b4c865 --- /dev/null +++ b/gst/deinterlace/tvtime/tomsmocomp/StrangeBob.inc @@ -0,0 +1,435 @@ +// -*- c++ -*- + + // First, get and save our possible Bob values + // Assume our pixels are layed out as follows with x the calc'd bob value + // and the other pixels are from the current field + // + // j a b c k current field + // x calculated line + // m d e f n current field + // + // we calc the bob value luma value as: + // if |j - n| < Thres && |a - m| > Thres + // avg(j,n) + // end if + // if |k - m| < Thres && |c - n| > Thres + // avg(k,m) + // end if + // if |c - d| < Thres && |b - f| > Thres + // avg(c,d) + // end if + // if |a - f| < Thres && |b - d| > Thres + // avg(a,f) + // end if + // if |b - e| < Thres + // avg(b,e) + // end if + // pickup any thing not yet set with avg(b,e) + +#ifndef IS_C + + // j, n + "pxor %%mm5, %%mm5\n\t" + "pxor %%mm6, %%mm6\n\t" + "pxor %%mm7, %%mm7\n\t" + + "movq -2(%%"XBX"), %%mm0\n\t" // value a from top left + "movq -4(%%"XBX", %%"XCX"), %%mm1\n\t" // value m from bottom right + + "movq %%mm0, %%mm3\n\t" + "psubusb %%mm1, %%mm3\n\t" + "psubusb %%mm0, %%mm1\n\t" + "por %%mm1, %%mm3\n\t" // abs(a,m) + + "psubusb "_DiffThres", %%mm3\n\t" // nonzero where abs(a,m) > Thres else 0 + "pxor %%mm4, %%mm4\n\t" + "pcmpeqb %%mm4, %%mm3\n\t" // now ff where abs(a,m) < Thres, else 00 + "pcmpeqb %%mm3, %%mm4\n\t" // here ff where abs(a,m) > Thres, else 00 + + + "movq -4(%%"XBX"), %%mm0\n\t" // value j + "movq 4(%%"XBX", %%"XCX"), %%mm1\n\t" // value n + "movq %%mm0, %%mm2\n\t" + V_PAVGB ("%%mm2", "%%mm1", "%%mm3", _ShiftMask) // avg(j,n) + "movq %%mm0, %%mm3\n\t" + "psubusb %%mm1, %%mm0\n\t" + "psubusb %%mm3, %%mm1\n\t" + "por %%mm1, %%mm0\n\t" // abs(j,n) + + "movq %%mm0, %%mm1\n\t" + "psubusb "_DiffThres", %%mm1\n\t" // nonzero where abs(j,n) > Thres else 0 + "pxor %%mm3, %%mm3\n\t" + "pcmpeqb %%mm3, %%mm1\n\t" // now ff where abs(j,n) < Thres, else 00 + + "pand %%mm4, %%mm1\n\t" + "pand %%mm1, %%mm2\n\t" + "pand %%mm1, %%mm0\n\t" + + "movq %%mm1, %%mm3\n\t" + "pxor %%mm5, %%mm3\n\t" + "pand %%mm3, %%mm6\n\t" + "pand %%mm3, %%mm7\n\t" + "pand %%mm3, %%mm5\n\t" + + "por %%mm1, %%mm5\n\t" + "por %%mm2, %%mm6\n\t" + "por %%mm0, %%mm7\n\t" + + // k & m + "movq 2(%%"XBX"), %%mm0\n\t" // value c from top left + "movq 4(%%"XBX", %%"XCX"), %%mm1\n\t" // value n from bottom right + + "movq %%mm0, %%mm3\n\t" + "psubusb %%mm1, %%mm3\n\t" + "psubusb %%mm0, %%mm1\n\t" + "por %%mm1, %%mm3\n\t" // abs(c,n) + + "psubusb "_DiffThres", %%mm3\n\t" // nonzero where abs(c,n) > Thres else 0 + "pxor %%mm4, %%mm4\n\t" + "pcmpeqb %%mm4, %%mm3\n\t" // now ff where abs(c,n) < Thres, else 00 + "pcmpeqb %%mm3, %%mm4\n\t" // here ff where abs(c,n) > Thres, else 00 + + + "movq 4(%%"XBX"), %%mm0\n\t" // value k + "movq -4(%%"XBX", %%"XCX"), %%mm1\n\t" // value m + "movq %%mm0, %%mm2\n\t" + V_PAVGB ("%%mm2", "%%mm1", "%%mm3", _ShiftMask) // avg(k,m) + "movq %%mm0, %%mm3\n\t" + "psubusb %%mm1, %%mm0\n\t" + "psubusb %%mm3, %%mm1\n\t" + "por %%mm1, %%mm0\n\t" // abs(k,m) + + "movq %%mm0, %%mm1\n\t" + "psubusb "_DiffThres", %%mm1\n\t" // nonzero where abs(k,m) > Thres else 0 + "pxor %%mm3, %%mm3\n\t" + "pcmpeqb %%mm3, %%mm1\n\t" // 
now ff where abs(k,m) < Thres, else 00 + + "pand %%mm4, %%mm1\n\t" + + "pand %%mm1, %%mm2\n\t" + "pand %%mm1, %%mm0\n\t" + + "movq %%mm1, %%mm3\n\t" + "pxor %%mm5, %%mm3\n\t" + "pand %%mm3, %%mm6\n\t" + "pand %%mm3, %%mm7\n\t" + "pand %%mm3, %%mm5\n\t" + + "por %%mm1, %%mm5\n\t" + "por %%mm2, %%mm6\n\t" + "por %%mm0, %%mm7\n\t" + + + // c & d + "movq (%%"XBX"), %%mm0\n\t" // value b from top left + "movq 2(%%"XBX", %%"XCX"), %%mm1\n\t" // value f from bottom right + + "movq %%mm0, %%mm3\n\t" + "psubusb %%mm1, %%mm3\n\t" + "psubusb %%mm0, %%mm1\n\t" + "por %%mm1, %%mm3\n\t" // abs(b,f) + + "psubusb "_DiffThres", %%mm3\n\t" // nonzero where abs(b,f) > Thres else 0 + "pxor %%mm4, %%mm4\n\t" + "pcmpeqb %%mm4, %%mm3\n\t" // now ff where abs(b,f) < Thres, else 00 + "pcmpeqb %%mm3, %%mm4\n\t" // here ff where abs(b,f) > Thres, else 00 + + "movq 2(%%"XBX"), %%mm0\n\t" // value c + "movq -2(%%"XBX", %%"XCX"), %%mm1\n\t" // value d + "movq %%mm0, %%mm2\n\t" + V_PAVGB ("%%mm2", "%%mm1", "%%mm3", _ShiftMask) // avg(c,d) + "movq %%mm0, %%mm3\n\t" + "psubusb %%mm1, %%mm0\n\t" + "psubusb %%mm3, %%mm1\n\t" + "por %%mm1, %%mm0\n\t" // abs(c,d) + + "movq %%mm0, %%mm1\n\t" + "psubusb "_DiffThres", %%mm1\n\t" // nonzero where abs(c,d) > Thres else 0 + "pxor %%mm3, %%mm3\n\t" + "pcmpeqb %%mm3, %%mm1\n\t" // now ff where abs(c,d) < Thres, else 00 + + "pand %%mm4, %%mm1\n\t" + + "pand %%mm1, %%mm2\n\t" + "pand %%mm1, %%mm0\n\t" + + "movq %%mm1, %%mm3\n\t" + "pxor %%mm5, %%mm3\n\t" + "pand %%mm3, %%mm6\n\t" + "pand %%mm3, %%mm7\n\t" + "pand %%mm3, %%mm5\n\t" + + "por %%mm1, %%mm5\n\t" + "por %%mm2, %%mm6\n\t" + "por %%mm0, %%mm7\n\t" + + // a & f + "movq (%%"XBX"), %%mm0\n\t" // value b from top left + "movq -2(%%"XBX", %%"XCX"), %%mm1\n\t" // value d from bottom right + + "movq %%mm0, %%mm3\n\t" + "psubusb %%mm1, %%mm3\n\t" + "psubusb %%mm0, %%mm1\n\t" + "por %%mm1, %%mm3\n\t" // abs(b,d) + + "psubusb "_DiffThres", %%mm3\n\t" // nonzero where abs(b,d) > Thres else 0 + "pxor %%mm4, %%mm4\n\t" + "pcmpeqb %%mm4, %%mm3\n\t" // now ff where abs(b,d) < Thres, else 00 + "pcmpeqb %%mm3, %%mm4\n\t" // here ff where abs(b,d) > Thres, else 00 + + "movq -2(%%"XBX"), %%mm0\n\t" // value a + "movq 2(%%"XBX", %%"XCX"), %%mm1\n\t" // value f + "movq %%mm0, %%mm2\n\t" + V_PAVGB ("%%mm2", "%%mm1", "%%mm3", _ShiftMask) // avg(a,f) + "movq %%mm0, %%mm3\n\t" + "psubusb %%mm1, %%mm0\n\t" + "psubusb %%mm3, %%mm1\n\t" + "por %%mm1, %%mm0\n\t" // abs(a,f) + + "movq %%mm0, %%mm1\n\t" + "psubusb "_DiffThres", %%mm1\n\t" // nonzero where abs(a,f) > Thres else 0 + "pxor %%mm3, %%mm3\n\t" + "pcmpeqb %%mm3, %%mm1\n\t" // now ff where abs(a,f) < Thres, else 00 + + "pand %%mm4, %%mm1\n\t" + + "pand %%mm1, %%mm2\n\t" + "pand %%mm1, %%mm0\n\t" + + "movq %%mm1, %%mm3\n\t" + "pxor %%mm5, %%mm3\n\t" + "pand %%mm3, %%mm6\n\t" + "pand %%mm3, %%mm7\n\t" + "pand %%mm3, %%mm5\n\t" + + "por %%mm1, %%mm5\n\t" + "por %%mm2, %%mm6\n\t" + "por %%mm0, %%mm7\n\t" + + "pand "_YMask", %%mm5\n\t" // mask out chroma from here + "pand "_YMask", %%mm6\n\t" // mask out chroma from here + "pand "_YMask", %%mm7\n\t" // mask out chroma from here + + // b,e + "movq (%%"XBX"), %%mm0\n\t" // value b from top + "movq (%%"XBX", %%"XCX"), %%mm1\n\t" // value e from bottom + "movq %%mm0, %%mm2\n\t" + V_PAVGB ("%%mm2", "%%mm1", "%%mm3", _ShiftMask) // avg(b,e) + "movq %%mm0, %%mm3\n\t" + "psubusb %%mm1, %%mm0\n\t" + "psubusb %%mm3, %%mm1\n\t" + "por %%mm1, %%mm0\n\t" // abs(b,e) + + "movq %%mm0, %%mm1\n\t" + "psubusb "_DiffThres", %%mm1\n\t" // nonzero where abs(b,e) > Thres 
else 0 + "pxor %%mm3, %%mm3\n\t" + "pcmpeqb %%mm3, %%mm1\n\t" // now ff where abs(b,e) < Thres, else 00 + + "pand %%mm1, %%mm2\n\t" + "pand %%mm1, %%mm0\n\t" + + "movq %%mm1, %%mm3\n\t" + "pxor %%mm5, %%mm3\n\t" + "pand %%mm3, %%mm6\n\t" + "pand %%mm3, %%mm7\n\t" + "pand %%mm3, %%mm5\n\t" + + "por %%mm1, %%mm5\n\t" + "por %%mm2, %%mm6\n\t" + "por %%mm0, %%mm7\n\t" + + // bob in any leftovers + "movq (%%"XBX"), %%mm0\n\t" // value b from top + "movq (%%"XBX", %%"XCX"), %%mm1\n\t" // value e from bottom + + +// We will also calc here the max/min values to later limit comb +// so the max excursion will not exceed the Max_Comb constant + +#ifdef SKIP_SEARCH + "movq %%mm0, %%mm2\n\t" +// pminub %%mm2, %%mm1 + V_PMINUB ("%%mm2", "%%mm1", "%%mm4") + +// pmaxub %%mm6, %%mm2 // clip our current results so far to be above this + V_PMAXUB ("%%mm6", "%%mm2") + "movq %%mm0, %%mm2\n\t" + V_PMAXUB ("%%mm2", "%%mm1") +// pminub %%mm6, %%mm2 // clip our current results so far to be below this + V_PMINUB ("%%mm6", "%%mm2", "%%mm4") + +#else + "movq %%mm0, %%mm2\n\t" + "movq (%%"XAX"), %%mm4\n\t" + "psubusb %%mm4, %%mm2\n\t" + "psubusb %%mm0, %%mm4\n\t" + "por %%mm2, %%mm4\n\t" // abs diff + + "movq %%mm1, %%mm2\n\t" + "movq (%%"XAX", %%"XCX"), %%mm3\n\t" + "psubusb %%mm3, %%mm2\n\t" + "psubusb %%mm1, %%mm3\n\t" + "por %%mm2, %%mm3\n\t" // abs diff +// pmaxub %%mm3, %%mm4 // top or bottom pixel moved most + V_PMAXUB ("%%mm3", "%%mm4") // top or bottom pixel moved most + "psubusb "_DiffThres", %%mm3\n\t" // moved more than allowed? or goes to 0? + "pxor %%mm4, %%mm4\n\t" + "pcmpeqb %%mm4, %%mm3\n\t" // now ff where low motion, else high motion + + "movq %%mm0, %%mm2\n\t" +// pminub %%mm2, %%mm1 + V_PMINUB ("%%mm2", "%%mm1", "%%mm4") + +// pmaxub %%mm6, %%mm2 // clip our current results so far to be above this + V_PMAXUB ("%%mm6", "%%mm2") + + "psubusb %%mm3, %%mm2\n\t" // maybe decrease it to 0000.. 
if no surround motion + "movq %%mm2, "_Min_Vals"\n\t" + + "movq %%mm0, %%mm2\n\t" + V_PMAXUB ("%%mm2", "%%mm1") +// pminub %%mm6, %%mm2 // clip our current results so far to be below this + V_PMINUB ("%%mm6", "%%mm2", "%%mm4") + "paddusb %%mm3, %%mm2\n\t" // maybe increase it to ffffff if no surround motion + "movq %%mm2, "_Max_Vals"\n\t" +#endif + + "movq %%mm0, %%mm2\n\t" +// pavgb %%mm2, %%mm1 // avg(b,e) + V_PAVGB ("%%mm2", "%%mm1", "%%mm3", _ShiftMask) // avg(b,e) + + "movq %%mm0, %%mm3\n\t" + "psubusb %%mm1, %%mm3\n\t" + "psubusb %%mm0, %%mm1\n\t" + "por %%mm1, %%mm3\n\t" // abs(b,e) + "movq %%mm3, %%mm1\n\t" // keep copy of diffs + + "pxor %%mm4, %%mm4\n\t" + "psubusb %%mm7, %%mm3\n\t" // nonzero where new weights bigger, else 0 + "pcmpeqb %%mm4, %%mm3\n\t" // now ff where new better, else 00 + "pcmpeqb %%mm0, %%mm0\n\t" + "pandn %%mm0, %%mm5\n\t" + "por %%mm5, %%mm3\n\t" + "pcmpeqb %%mm3, %%mm4\n\t" // here ff where old better, else 00 + + "pand %%mm3, %%mm1\n\t" + "pand %%mm3, %%mm2\n\t" + + "pand %%mm4, %%mm6\n\t" + "pand %%mm4, %%mm7\n\t" + + "por %%mm2, %%mm6\n\t" // our x2 value + "por %%mm1, %%mm7\n\t" // our x2 diffs + "movq %%mm7, %%mm4\n\t" // save as bob uncertainty indicator + +#else + + diff[0] = -1; + diff[1] = -1; + best[0] = 0; + best[1] = 0; + // j, n + if (ABS (pBob[-2] - pBob[src_pitch2 - 4]) < DiffThres && + ABS (pBob[-4] - pBob[src_pitch2 + 4]) > DiffThres) { + best[0] = (pBob[-2] + pBob[src_pitch2 - 4]) / 2; + diff[0] = ABS (pBob[-2] - pBob[src_pitch2 - 4]); + } + if (ABS (pBob[-1] - pBob[src_pitch2 - 3]) < DiffThres && + ABS (pBob[-3] - pBob[src_pitch2 + 5]) > DiffThres) { + best[1] = (pBob[-1] + pBob[src_pitch2 - 3]) / 2; + diff[1] = ABS (pBob[-1] - pBob[src_pitch2 - 3]); + } + + // k & m + if (ABS (pBob[2] - pBob[src_pitch2 + 4]) < DiffThres && + ABS (pBob[4] - pBob[src_pitch2 - 4]) > DiffThres) { + best[0] = (pBob[4] + pBob[src_pitch2 - 4]) / 2; + diff[0] = ABS (pBob[4] - pBob[src_pitch2 - 4]); + } + + if (ABS (pBob[3] - pBob[src_pitch2 + 5]) < DiffThres && + ABS (pBob[5] - pBob[src_pitch2 - 3]) > DiffThres) { + best[1] = (pBob[5] + pBob[src_pitch2 - 3]) / 2; + diff[1] = ABS (pBob[5] - pBob[src_pitch2 - 3]); + } + + // c & d + if (ABS (pBob[0] - pBob[src_pitch2 + 2]) < DiffThres && + ABS (pBob[2] - pBob[src_pitch2 - 2]) > DiffThres) { + best[0] = (pBob[2] + pBob[src_pitch2 - 2]) / 2; + diff[0] = ABS (pBob[2] - pBob[src_pitch2 - 2]); + } + + if (ABS (pBob[1] - pBob[src_pitch2 + 3]) < DiffThres && + ABS (pBob[3] - pBob[src_pitch2 - 1]) > DiffThres) { + best[1] = (pBob[3] + pBob[src_pitch2 - 1]) / 2; + diff[1] = ABS (pBob[3] - pBob[src_pitch2 - 1]); + } + + // a & f + if (ABS (pBob[0] - pBob[src_pitch2 - 2]) < DiffThres && + ABS (pBob[-2] - pBob[src_pitch2 + 2]) > DiffThres) { + best[0] = (pBob[-2] + pBob[src_pitch2 + 2]) / 2; + diff[0] = ABS (pBob[-2] - pBob[src_pitch2 + 2]); + } + + if (ABS (pBob[1] - pBob[src_pitch2 - 1]) < DiffThres && + ABS (pBob[-1] - pBob[src_pitch2 + 3]) > DiffThres) { + best[1] = (pBob[-1] + pBob[src_pitch2 + 3]) / 2; + diff[1] = ABS (pBob[-1] - pBob[src_pitch2 + 3]); + } + + // b,e + if (ABS (pBob[0] - pBob[src_pitch2]) < DiffThres) { + best[0] = (pBob[0] + pBob[src_pitch2]) / 2; + diff[0] = ABS (pBob[0] - pBob[src_pitch2]); + } + + if (ABS (pBob[1] - pBob[src_pitch2 + 1]) < DiffThres) { + best[1] = (pBob[1] + pBob[src_pitch2 + 1]) / 2; + diff[1] = ABS (pBob[1] - pBob[src_pitch2 + 1]); + } + + +// We will also calc here the max/min values to later limit comb +// so the max excursion will not exceed the Max_Comb constant + +#ifdef 
SKIP_SEARCH
+ best[0] = CLAMP (best[0], MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0]));
+ best[1] = CLAMP (best[1], MIN (pBob[src_pitch2 + 1], pBob[1]), MAX (pBob[src_pitch2 + 1], pBob[1]));
+#else
+ mov[0] = MAX (ABS (pBob[0] - pBobP[0]), ABS (pBob[src_pitch2] - pBobP[src_pitch2]));
+ mov[1] = MAX (ABS (pBob[1] - pBobP[1]), ABS (pBob[src_pitch2 + 1] - pBobP[src_pitch2 + 1]));
+
+ MinVals[0] = 0;
+ MinVals[1] = 0;
+ MaxVals[0] = 255;
+ MaxVals[1] = 255;
+ if (mov[0] > DiffThres) {
+ MinVals[0] = MAX (MIN (pBob[0], pBob[src_pitch2]), best[0]);
+ MaxVals[0] = MIN (MAX (pBob[0], pBob[src_pitch2]), best[0]);
+ }
+
+ if (mov[1] > DiffThres) {
+ MinVals[1] = MAX (MIN (pBob[1], pBob[src_pitch2+1]), best[1]);
+ MaxVals[1] = MIN (MAX (pBob[1], pBob[src_pitch2+1]), best[1]);
+ }
+
+ best[0] = CLAMP (best[0], MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0]));
+ best[1] = CLAMP (best[1], MIN (pBob[src_pitch2 + 1], pBob[1]), MAX (pBob[src_pitch2 + 1], pBob[1]));
+#endif
+ avg[0] = (pBob[src_pitch2] + pBob[0]) / 2;
+ avg[1] = (pBob[src_pitch2 + 1] + pBob[1]) / 2;
+ diff2[0] = ABS (pBob[src_pitch2] - pBob[0]);
+ diff2[1] = ABS (pBob[src_pitch2 + 1] - pBob[1]);
+
+ if (diff[0] == -1 || diff2[0] < diff[0]) {
+ best[0] = avg[0];
+ diff[0] = diff2[0];
+ }
+
+ if (diff[1] == -1 || diff2[1] < diff[1]) {
+ best[1] = avg[1];
+ diff[1] = diff2[1];
+ }
+#endif
diff --git a/gst/deinterlace/tvtime/tomsmocomp/TomsMoCompAll.inc b/gst/deinterlace/tvtime/tomsmocomp/TomsMoCompAll.inc new file mode 100644 index 00000000..e8883dd3 --- /dev/null +++ b/gst/deinterlace/tvtime/tomsmocomp/TomsMoCompAll.inc @@ -0,0 +1,241 @@
+/*
+ * GStreamer
+ * Copyright (c) 2002 Tom Barry All rights reserved.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * Relicensed for GStreamer from GPL to LGPL with permit from Tom Barry.
+ * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578 + */ + + +#ifndef TopFirst +#define TopFirst IsOdd +#endif + +#ifdef SEFUNC +#undef SEFUNC +#endif + +#if defined(IS_MMXEXT) +#define SEFUNC(x) Search_Effort_MMXEXT_##x(int src_pitch, int dst_pitch, int rowsize, const unsigned char *pWeaveSrc, const unsigned char *pWeaveSrcP, unsigned char *pWeaveDest, int IsOdd, const unsigned char *pCopySrc, const unsigned char *pCopySrcP, int FldHeight) +#elif defined(IS_3DNOW) +#define SEFUNC(x) Search_Effort_3DNOW_##x(int src_pitch, int dst_pitch, int rowsize, const unsigned char *pWeaveSrc, const unsigned char *pWeaveSrcP, unsigned char *pWeaveDest, int IsOdd, const unsigned char *pCopySrc, const unsigned char *pCopySrcP, int FldHeight) +#elif defined(IS_MMX) +#define SEFUNC(x) Search_Effort_MMX_##x(int src_pitch, int dst_pitch, int rowsize, const unsigned char *pWeaveSrc, const unsigned char *pWeaveSrcP, unsigned char *pWeaveDest, int IsOdd, const unsigned char *pCopySrc, const unsigned char *pCopySrcP, int FldHeight) +#else +#define SEFUNC(x) Search_Effort_C_##x(int src_pitch, int dst_pitch, int rowsize, const unsigned char *pWeaveSrc, const unsigned char *pWeaveSrcP, unsigned char *pWeaveDest, int IsOdd, const unsigned char *pCopySrc, const unsigned char *pCopySrcP, int FldHeight) +#endif + +#include "TomsMoCompAll2.inc" + +#define USE_STRANGE_BOB + +#include "TomsMoCompAll2.inc" + +#undef USE_STRANGE_BOB + +#undef SEFUNC +#if defined(IS_MMXEXT) +#define SEFUNC(x) Search_Effort_MMXEXT_##x(src_pitch, dst_pitch, rowsize, pWeaveSrc, pWeaveSrcP, pWeaveDest, IsOdd, pCopySrc, pCopySrcP, FldHeight) +#elif defined(IS_3DNOW) +#define SEFUNC(x) Search_Effort_3DNOW_##x(src_pitch, dst_pitch, rowsize, pWeaveSrc, pWeaveSrcP, pWeaveDest, IsOdd, pCopySrc, pCopySrcP, FldHeight) +#elif defined(IS_MMX) +#define SEFUNC(x) Search_Effort_MMX_##x(src_pitch, dst_pitch, rowsize, pWeaveSrc, pWeaveSrcP, pWeaveDest, IsOdd, pCopySrc, pCopySrcP, FldHeight) +#else +#define SEFUNC(x) Search_Effort_C_##x(src_pitch, dst_pitch, rowsize, pWeaveSrc, pWeaveSrcP, pWeaveDest, IsOdd, pCopySrc, pCopySrcP, FldHeight) +#endif + +void FUNCT_NAME(GstDeinterlaceMethod *d_method, GstDeinterlace* object, GstBuffer *outbuf) +{ + GstDeinterlaceMethodTomsMoComp *self = GST_DEINTERLACE_METHOD_TOMSMOCOMP (d_method); + long SearchEffort = self->search_effort; + int UseStrangeBob = self->strange_bob; + int IsOdd; + const unsigned char *pWeaveSrc; + const unsigned char *pWeaveSrcP; + unsigned char *pWeaveDest; + const unsigned char *pCopySrc; + const unsigned char *pCopySrcP; + unsigned char *pCopyDest; + int src_pitch; + int dst_pitch; + int rowsize; + int FldHeight; + + /* double stride do address just every odd/even scanline */ + src_pitch = object->field_stride; + dst_pitch = object->row_stride; + rowsize = object->row_stride; + FldHeight = object->field_height; + + pCopySrc = GST_BUFFER_DATA(object->field_history[object->history_count-1].buf); + pCopySrcP = GST_BUFFER_DATA(object->field_history[object->history_count-3].buf); + pWeaveSrc = GST_BUFFER_DATA(object->field_history[object->history_count-2].buf); + pWeaveSrcP = GST_BUFFER_DATA(object->field_history[object->history_count-4].buf); + + /* use bottom field and interlace top field */ + if (object->field_history[object->history_count-2].flags == PICTURE_INTERLACED_BOTTOM) { + IsOdd = 1; + + // if we have an odd field we copy an even field and weave an odd field + pCopyDest = GST_BUFFER_DATA(outbuf); + pWeaveDest = pCopyDest + dst_pitch; + } + /* do it vice verca */ + else { + + IsOdd = 
0; + // if we have an even field we copy an odd field and weave an even field + pCopyDest = GST_BUFFER_DATA(outbuf) + dst_pitch; + pWeaveDest = GST_BUFFER_DATA(outbuf); + } + + + // copy 1st and last weave lines + Fieldcopy(pWeaveDest, pCopySrc, rowsize, + 1, dst_pitch*2, src_pitch); + Fieldcopy(pWeaveDest+(FldHeight-1)*dst_pitch*2, + pCopySrc+(FldHeight-1)*src_pitch, rowsize, + 1, dst_pitch*2, src_pitch); + +#ifdef USE_VERTICAL_FILTER + // Vertical Filter currently not implemented for DScaler !! + // copy 1st and last lines the copy field + Fieldcopy(pCopyDest, pCopySrc, rowsize, + 1, dst_pitch*2, src_pitch); + Fieldcopy(pCopyDest+(FldHeight-1)*dst_pitch*2, + pCopySrc+(FldHeight-1)*src_pitch, rowsize, + 1, dst_pitch*2, src_pitch); +#else + + // copy all of the copy field + Fieldcopy(pCopyDest, pCopySrc, rowsize, + FldHeight, dst_pitch*2, src_pitch); +#endif + // then go fill in the hard part, being variously lazy depending upon + // SearchEffort + + if(!UseStrangeBob) { + if (SearchEffort == 0) + { + SEFUNC(0); + } + else if (SearchEffort <= 1) + { + SEFUNC(1); + } + /* else if (SearchEffort <= 2) + { + SEFUNC(2); + } + */ + else if (SearchEffort <= 3) + { + SEFUNC(3); + } + else if (SearchEffort <= 5) + { + SEFUNC(5); + } + else if (SearchEffort <= 9) + { + SEFUNC(9); + } + else if (SearchEffort <= 11) + { + SEFUNC(11); + } + else if (SearchEffort <= 13) + { + SEFUNC(13); + } + else if (SearchEffort <= 15) + { + SEFUNC(15); + } + else if (SearchEffort <= 19) + { + SEFUNC(19); + } + else if (SearchEffort <= 21) + { + SEFUNC(21); + } + else + { + SEFUNC(Max); + } + } + else + { + if (SearchEffort == 0) + { + SEFUNC(0SB); + } + else if (SearchEffort <= 1) + { + SEFUNC(1SB); + } + /* else if (SearchEffort <= 2) + { + SEFUNC(2SB); + } + */ + else if (SearchEffort <= 3) + { + SEFUNC(3SB); + } + else if (SearchEffort <= 5) + { + SEFUNC(5SB); + } + else if (SearchEffort <= 9) + { + SEFUNC(9SB); + } + else if (SearchEffort <= 11) + { + SEFUNC(11SB); + } + else if (SearchEffort <= 13) + { + SEFUNC(13SB); + } + else if (SearchEffort <= 15) + { + SEFUNC(15SB); + } + else if (SearchEffort <= 19) + { + SEFUNC(19SB); + } + else if (SearchEffort <= 21) + { + SEFUNC(21SB); + } + else + { + SEFUNC(MaxSB); + } + } + +#if defined(BUILD_X86_ASM) && !defined(IS_C) + __asm__ __volatile__("emms"); +#endif +} diff --git a/gst/deinterlace/tvtime/tomsmocomp/TomsMoCompAll2.inc b/gst/deinterlace/tvtime/tomsmocomp/TomsMoCompAll2.inc new file mode 100644 index 00000000..f6344eab --- /dev/null +++ b/gst/deinterlace/tvtime/tomsmocomp/TomsMoCompAll2.inc @@ -0,0 +1,243 @@ +// -*- c++ -*- + +#ifdef SEARCH_EFFORT_FUNC +#undef SEARCH_EFFORT_FUNC +#endif + +#ifdef USE_STRANGE_BOB +#define SEARCH_EFFORT_FUNC(n) SEFUNC(n##SB) +#else +#define SEARCH_EFFORT_FUNC(n) SEFUNC(n) +#endif + +static inline int SEARCH_EFFORT_FUNC(0) // we don't try at all ;-) +{ + //see Search_Effort_Max() for comments +#define SKIP_SEARCH +#include "SearchLoopTop.inc" +#include "SearchLoopBottom.inc" +#undef SKIP_SEARCH +} + +static inline int SEARCH_EFFORT_FUNC(1) +{ +#ifdef IS_C +#define SKIP_SEARCH +#include "SearchLoopTop.inc" +#include "SearchLoopBottom.inc" +#undef SKIP_SEARCH +#else + //see Search_Effort_Max() for comments +#include "SearchLoopTop.inc" + RESET_CHROMA // pretend chroma diffs was 255 each +#include "SearchLoop0A.inc" +#include "SearchLoopBottom.inc" +#endif +} + +static inline int SEARCH_EFFORT_FUNC(3) +{ +#ifdef IS_C +#define SKIP_SEARCH +#include "SearchLoopTop.inc" +#include "SearchLoopBottom.inc" +#undef SKIP_SEARCH +#else + 
//see Search_Effort_Max() for comments +#include "SearchLoopTop.inc" +#include "SearchLoopOddA2.inc" + RESET_CHROMA // pretend chroma diffs was 255 each +#include "SearchLoop0A.inc" +#include "SearchLoopBottom.inc" +#endif +} + +static inline int SEARCH_EFFORT_FUNC(5) +{ +#ifdef IS_C +#define SKIP_SEARCH +#include "SearchLoopTop.inc" +#include "SearchLoopBottom.inc" +#undef SKIP_SEARCH +#else + //see Search_Effort_Max() for comments +#include "SearchLoopTop.inc" +#include "SearchLoopOddA2.inc" +#include "SearchLoopOddAH2.inc" + RESET_CHROMA // pretend chroma diffs was 255 each +#include "SearchLoop0A.inc" +#include "SearchLoopBottom.inc" +#endif +} + +// 3x3 search +static inline int SEARCH_EFFORT_FUNC(9) +{ +#ifdef IS_C +#define SKIP_SEARCH +#include "SearchLoopTop.inc" +#include "SearchLoopBottom.inc" +#undef SKIP_SEARCH +#else + //see SearchEffortMax() for comments +#include "SearchLoopTop.inc" +#include "SearchLoopOddA.inc" + RESET_CHROMA // pretend chroma diffs was 255 each +#include "SearchLoopVA.inc" +#include "SearchLoop0A.inc" +#include "SearchLoopBottom.inc" +#endif +} + +// Search 9 with 2 H-half pels added +static inline int SEARCH_EFFORT_FUNC(11) +{ +#ifdef IS_C +#define SKIP_SEARCH +#include "SearchLoopTop.inc" +#include "SearchLoopBottom.inc" +#undef SKIP_SEARCH +#else + //see SearchEffortMax() for comments +#include "SearchLoopTop.inc" +#include "SearchLoopOddA.inc" +#include "SearchLoopOddAH2.inc" + RESET_CHROMA // pretend chroma diffs was 255 each +#include "SearchLoopVA.inc" +#include "SearchLoop0A.inc" +#include "SearchLoopBottom.inc" +#endif +} + +// Search 11 with 2 V-half pels added +static inline int SEARCH_EFFORT_FUNC(13) +{ +#ifdef IS_C +#define SKIP_SEARCH +#include "SearchLoopTop.inc" +#include "SearchLoopBottom.inc" +#undef SKIP_SEARCH +#else + //see SearchEffortMax() for comments +#include "SearchLoopTop.inc" +#include "SearchLoopOddA.inc" +#include "SearchLoopOddAH2.inc" + RESET_CHROMA // pretend chroma diffs was 255 each +#include "SearchLoopVAH.inc" +#include "SearchLoopVA.inc" +#include "SearchLoop0A.inc" +#include "SearchLoopBottom.inc" +#endif +} + +// 5x3 +static inline int SEARCH_EFFORT_FUNC(15) +{ +#ifdef IS_C +#define SKIP_SEARCH +#include "SearchLoopTop.inc" +#include "SearchLoopBottom.inc" +#undef SKIP_SEARCH +#else + //see SearchEffortMax() for comments +#include "SearchLoopTop.inc" +#include "SearchLoopOddA.inc" + RESET_CHROMA // pretend chroma diffs was 255 each +#include "SearchLoopEdgeA.inc" +#include "SearchLoopVA.inc" +#include "SearchLoop0A.inc" +#include "SearchLoopBottom.inc" +#endif +} + +// 5x3 + 4 half pels +static inline int SEARCH_EFFORT_FUNC(19) +{ +#ifdef IS_C +#define SKIP_SEARCH +#include "SearchLoopTop.inc" +#include "SearchLoopBottom.inc" +#undef SKIP_SEARCH +#else + //see SearchEffortMax() for comments +#include "SearchLoopTop.inc" +#include "SearchLoopOddA.inc" +#include "SearchLoopOddAH2.inc" + RESET_CHROMA // pretend chroma diffs was 255 each +#include "SearchLoopEdgeA.inc" +#include "SearchLoopVAH.inc" +#include "SearchLoopVA.inc" +#include "SearchLoop0A.inc" +#include "SearchLoopBottom.inc" +#endif +} + +// Handle one 4x1 block of pixels +// Search a 7x3 area, no half pels + +static inline int SEARCH_EFFORT_FUNC(21) +{ +#ifdef IS_C +#define SKIP_SEARCH +#include "SearchLoopTop.inc" +#include "SearchLoopBottom.inc" +#undef SKIP_SEARCH +#else + //see SearchLoopTop.inc for comments +#include "SearchLoopTop.inc" + + // odd addresses -- the pixels at odd address wouldn't generate + // good luma values but we will mask those 
off + +#include "SearchLoopOddA6.inc" // 4 odd v half pels, 3 to left & right +#include "SearchLoopOddA.inc" // 6 odd pels, 1 to left & right + + RESET_CHROMA // pretend chroma diffs was 255 each + + // even addresses -- use both luma and chroma from these + // search averages of 2 pixels left and right +#include "SearchLoopEdgeA.inc" + // search vertical line and averages, -1,0,+1 +#include "SearchLoopVA.inc" + // blend our results and loop +#include "SearchLoop0A.inc" +#include "SearchLoopBottom.inc" +#endif +} + +// Handle one 4x1 block of pixels +// Search a 9x3 area, no half pels +static inline int SEARCH_EFFORT_FUNC(Max) +{ +#ifdef IS_C +#define SKIP_SEARCH +#include "SearchLoopTop.inc" +#include "SearchLoopBottom.inc" +#undef SKIP_SEARCH +#else + //see SearchLoopTop.inc for comments +#include "SearchLoopTop.inc" + + // odd addresses -- the pixels at odd address wouldn't generate + // good luma values but we will mask those off + +#include "SearchLoopOddA6.inc" // 4 odd v half pels, 3 to left & right +#include "SearchLoopOddA.inc" // 6 odd pels, 1 to left & right + + RESET_CHROMA // pretend chroma diffs was 255 each + + // even addresses -- use both luma and chroma from these + // search averages of 4 pixels left and right +#include "SearchLoopEdgeA8.inc" + // search averages of 2 pixels left and right +#include "SearchLoopEdgeA.inc" + // search vertical line and averages, -1,0,+1 +#include "SearchLoopVA.inc" + // blend our results and loop +#include "SearchLoop0A.inc" +#include "SearchLoopBottom.inc" +#endif +} + +#undef SEARCH_EFFORT_FUNC + diff --git a/gst/deinterlace/tvtime/tomsmocomp/WierdBob.inc b/gst/deinterlace/tvtime/tomsmocomp/WierdBob.inc new file mode 100644 index 00000000..f4bbb830 --- /dev/null +++ b/gst/deinterlace/tvtime/tomsmocomp/WierdBob.inc @@ -0,0 +1,286 @@ +// -*- c++ -*- + + // First, get and save our possible Bob values + // Assume our pixels are layed out as follows with x the calc'd bob value + // and the other pixels are from the current field + // + // j a b c k current field + // x calculated line + // m d e f n current field + // + // we calc the bob value as: + // x2 = either avg(a,f), avg(c,d), avg(b,e), avg(j,n), or avg(k,m) + + // selected for the smallest of abs(a,f), abs(c,d), or abs(b,e), etc. 
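+
+// Restating the selection rule above as plain C (an illustrative sketch, not
+// part of the original file; the real scalar path is the non-asm branch
+// further down): try each candidate pair and keep the average of the pair
+// whose members agree best across the missing line.
+#if 0
+static inline int
+wierd_bob_scalar (int j, int a, int b, int c, int k,
+    int m, int d, int e, int f, int n, int *out_diff)
+{
+  /* candidate pairs, one pixel above and one below the missing line */
+  const int pairs[5][2] = {
+    { a, f }, { c, d }, { b, e }, { j, n }, { k, m }
+  };
+  int best = (a + f) / 2;
+  int best_diff = ABS (a - f);
+  int i;
+
+  for (i = 1; i < 5; i++) {
+    int diff = ABS (pairs[i][0] - pairs[i][1]);
+    if (diff < best_diff) {
+      best = (pairs[i][0] + pairs[i][1]) / 2;
+      best_diff = diff;
+    }
+  }
+  *out_diff = best_diff;   /* uncertainty rating for this bob value */
+  return best;
+}
+#endif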
+ +#ifndef IS_C + // a,f + "movq -2(%%"XBX"), %%mm0\n\t" // value a from top left + "movq 2(%%"XBX", %%"XCX"), %%mm1\n\t" // value f from bottom right + "movq %%mm0, %%mm6\n\t" +// pavgb %%mm6, %%mm1 // avg(a,f), also best so far + V_PAVGB ("%%mm6", "%%mm1", "%%mm7", _ShiftMask) // avg(a,f), also best so far + "movq %%mm0, %%mm7\n\t" + "psubusb %%mm1, %%mm7\n\t" + "psubusb %%mm0, %%mm1\n\t" + "por %%mm1, %%mm7\n\t" // abs diff, also best so far + + // c,d + "movq 2(%%"XBX"), %%mm0\n\t" // value a from top left + "movq -2(%%"XBX", %%"XCX"), %%mm1\n\t" // value f from bottom right + "movq %%mm0, %%mm2\n\t" +// pavgb %%mm2, %%mm1 // avg(c,d) + V_PAVGB ("%%mm2", "%%mm1", "%%mm3", _ShiftMask) // avg(c,d) + "movq %%mm0, %%mm3\n\t" + "psubusb %%mm1, %%mm3\n\t" + "psubusb %%mm0, %%mm1\n\t" + "por %%mm1, %%mm3\n\t" // abs(c,d) + "movq %%mm3, %%mm1\n\t" // keep copy + + "psubusb %%mm7, %%mm3\n\t" // nonzero where new weights bigger, else 0 + "pxor %%mm4, %%mm4\n\t" + "pcmpeqb %%mm4, %%mm3\n\t" // now ff where new better, else 00 + "pcmpeqb %%mm3, %%mm4\n\t" // here ff where old better, else 00 + + "pand %%mm3, %%mm1\n\t" // keep only better new avg and abs + "pand %%mm3, %%mm2\n\t" + + "pand %%mm4, %%mm6\n\t" + "pand %%mm4, %%mm7\n\t" + + "por %%mm2, %%mm6\n\t" // and merge new & old vals keeping best + "por %%mm1, %%mm7\n\t" + "por "_UVMask", %%mm7\n\t" // but we know chroma is worthless so far + "pand "_YMask", %%mm5\n\t" // mask out chroma from here also + + // j,n + "movq -4(%%"XBX"), %%mm0\n\t" // value j from top left + "movq 4(%%"XBX", %%"XCX"), %%mm1\n\t" // value n from bottom right + "movq %%mm0, %%mm2\n\t" +// pavgb %%mm2, %%mm1 // avg(j,n) + V_PAVGB ("%%mm2", "%%mm1", "%%mm3", _ShiftMask) // avg(j,n) + "movq %%mm0, %%mm3\n\t" + "psubusb %%mm1, %%mm3\n\t" + "psubusb %%mm0, %%mm1\n\t" + "por %%mm1, %%mm3\n\t" // abs(j-n) + "movq %%mm3, %%mm1\n\t" // keep copy + + "psubusb %%mm7, %%mm3\n\t" // nonzero where new weights bigger, else 0 + "pxor %%mm4, %%mm4\n\t" + "pcmpeqb %%mm4, %%mm3\n\t" // now ff where new better, else 00 + "pcmpeqb %%mm3, %%mm4\n\t" // here ff where old better, else 00 + + "pand %%mm3, %%mm1\n\t" // keep only better new avg and abs + "pand %%mm2, %%mm3\n\t" + + "pand %%mm4, %%mm6\n\t" + "pand %%mm4, %%mm7\n\t" + + "por %%mm3, %%mm6\n\t" // and merge new & old vals keeping best + "por %%mm1, %%mm7\n\t" // " + + // k, m + "movq 4(%%"XBX"), %%mm0\n\t" // value k from top right + "movq -4(%%"XBX", %%"XCX"), %%mm1\n\t" // value n from bottom left + "movq %%mm0, %%mm4\n\t" +// pavgb %%mm4, %%mm1 // avg(k,m) + V_PAVGB ("%%mm4", "%%mm1", "%%mm3", _ShiftMask) // avg(k,m) + + "movq %%mm0, %%mm3\n\t" + "psubusb %%mm1, %%mm3\n\t" + "psubusb %%mm0, %%mm1\n\t" + "por %%mm1, %%mm3\n\t" // abs(k,m) + "movq %%mm3, %%mm1\n\t" // keep copy + + "movq %%mm4, %%mm2\n\t" // avg(k,m) + + "psubusb %%mm7, %%mm3\n\t" // nonzero where new weights bigger, else 0 + "pxor %%mm4, %%mm4\n\t" + "pcmpeqb %%mm4, %%mm3\n\t" // now ff where new better, else 00 + "pcmpeqb %%mm3, %%mm4\n\t" // here ff where old better, else 00 + + "pand %%mm3, %%mm1\n\t" // keep only better new avg and abs + "pand %%mm2, %%mm3\n\t" + + "pand %%mm4, %%mm6\n\t" + "pand %%mm4, %%mm7\n\t" + + "por %%mm3, %%mm6\n\t" // and merge new & old vals keeping best + "por %%mm1, %%mm7\n\t" // " + + // b,e + "movq (%%"XBX"), %%mm0\n\t" // value b from top + "movq (%%"XBX", %%"XCX"), %%mm1\n\t" // value e from bottom + +// We will also calc here the max/min values to later limit comb +// so the max excursion will not exceed the Max_Comb 
+// We will also calc here the max/min values to later limit comb
+// so the max excursion will not exceed the Max_Comb constant
+
+#ifdef SKIP_SEARCH
+	"movq %%mm0, %%mm2\n\t"
+//	pminub %%mm2, %%mm1
+	V_PMINUB ("%%mm2", "%%mm1", "%%mm4")
+
+//	pmaxub %%mm6, %%mm2			// clip our current results so far to be above this
+	V_PMAXUB ("%%mm6", "%%mm2")
+	"movq %%mm0, %%mm2\n\t"
+	V_PMAXUB ("%%mm2", "%%mm1")
+//	pminub %%mm6, %%mm2			// clip our current results so far to be below this
+	V_PMINUB ("%%mm6", "%%mm2", "%%mm4")
+
+#else
+	"movq %%mm0, %%mm2\n\t"
+	"movq (%%"XAX"), %%mm4\n\t"
+	"psubusb %%mm4, %%mm2\n\t"
+	"psubusb %%mm0, %%mm4\n\t"
+	"por %%mm2, %%mm4\n\t"			// abs diff
+
+	"movq %%mm1, %%mm2\n\t"
+	"movq (%%"XAX", %%"XCX"), %%mm3\n\t"
+	"psubusb %%mm3, %%mm2\n\t"
+	"psubusb %%mm1, %%mm3\n\t"
+	"por %%mm2, %%mm3\n\t"			// abs diff
+//	pmaxub %%mm3, %%mm4			// top or bottom pixel moved most
+	V_PMAXUB ("%%mm3", "%%mm4")		// top or bottom pixel moved most
+	"psubusb "_Max_Mov", %%mm3\n\t"		// moved more than allowed? or goes to 0?
+	"pxor %%mm4, %%mm4\n\t"
+	"pcmpeqb %%mm4, %%mm3\n\t"		// now ff where low motion, else high motion
+
+	"movq %%mm0, %%mm2\n\t"
+//	pminub %%mm2, %%mm1
+	V_PMINUB ("%%mm2", "%%mm1", "%%mm4")
+
+//	pmaxub %%mm6, %%mm2			// clip our current results so far to be above this
+	V_PMAXUB ("%%mm6", "%%mm2")
+
+	"psubusb %%mm3, %%mm2\n\t"		// maybe decrease it to 0000.. if no surround motion
+	"movq %%mm2, "_Min_Vals"\n\t"
+
+	"movq %%mm0, %%mm2\n\t"
+	V_PMAXUB ("%%mm2", "%%mm1")
+//	pminub %%mm6, %%mm2			// clip our current results so far to be below this
+	V_PMINUB ("%%mm6", "%%mm2", "%%mm4")
+	"paddusb %%mm3, %%mm2\n\t"		// maybe increase it to ffffff if no surround motion
+	"movq %%mm2, "_Max_Vals"\n\t"
+#endif
+
+	"movq %%mm0, %%mm2\n\t"
+//	pavgb %%mm2, %%mm1			// avg(b,e)
+	V_PAVGB ("%%mm2", "%%mm1", "%%mm3", _ShiftMask)	// avg(b,e)
+
+	"movq %%mm0, %%mm3\n\t"
+	"psubusb %%mm1, %%mm3\n\t"
+	"psubusb %%mm0, %%mm1\n\t"
+	"por %%mm1, %%mm3\n\t"			// abs(b-e)
+	"movq %%mm3, %%mm1\n\t"			// keep copy of diffs
+
+	"pxor %%mm4, %%mm4\n\t"
+	"psubusb %%mm7, %%mm3\n\t"		// nonzero where new weights bigger, else 0
+	"pcmpeqb %%mm4, %%mm3\n\t"		// now ff where new better, else 00
+
+	"pcmpeqb %%mm3, %%mm4\n\t"		// here ff where old better, else 00
+
+	"pand %%mm3, %%mm1\n\t"
+	"pand %%mm3, %%mm2\n\t"
+
+	"pand %%mm4, %%mm6\n\t"
+	"pand %%mm4, %%mm7\n\t"
+
+	"por %%mm2, %%mm6\n\t"			// our x2 value
+	"por %%mm1, %%mm7\n\t"			// our x2 diffs
+	"movq %%mm7, %%mm4\n\t"			// save as bob uncertainty indicator
+
+#else
+
+	// a,f
+	best[0] = (pBob[-2] + pBob[src_pitch2 + 2]) / 2;
+	diff[0] = ABS (pBob[-2] - pBob[src_pitch2 + 2]);
+	best[1] = (pBob[-1] + pBob[src_pitch2 + 3]) / 2;
+	diff[1] = ABS (pBob[-1] - pBob[src_pitch2 + 3]);
+
+	// c,d
+	if (ABS (pBob[2] - pBob[src_pitch2 - 2]) < diff[0]) {
+	  best[0] = (pBob[2] + pBob[src_pitch2 - 2]) / 2;
+	  diff[0] = ABS (pBob[2] - pBob[src_pitch2 - 2]);
+	}
+
+	if (ABS (pBob[3] - pBob[src_pitch2 - 1]) < diff[1]) {
+	  best[1] = (pBob[3] + pBob[src_pitch2 - 1]) / 2;
+	  diff[1] = ABS (pBob[3] - pBob[src_pitch2 - 1]);
+	}
+
+	// j,n
+	if (ABS (pBob[-4] - pBob[src_pitch2 + 4]) < diff[0]) {
+	  best[0] = (pBob[-4] + pBob[src_pitch2 + 4]) / 2;
+	  diff[0] = ABS (pBob[-4] - pBob[src_pitch2 + 4]);
+	}
+
+	if (ABS (pBob[-3] - pBob[src_pitch2 + 5]) < diff[1]) {
+	  best[1] = (pBob[-3] + pBob[src_pitch2 + 5]) / 2;
+	  diff[1] = ABS (pBob[-3] - pBob[src_pitch2 + 5]);
+	}
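	/* Each of these pairs repeats the same update, so a hypothetical helper
	 * (not in the original) would make the luma pattern explicit:
	 *
	 *   #define TRY_PAIR(t, b)                                        \
	 *     if (ABS (pBob[(t)] - pBob[src_pitch2 + (b)]) < diff[0]) {   \
	 *       best[0] = (pBob[(t)] + pBob[src_pitch2 + (b)]) / 2;       \
	 *       diff[0] = ABS (pBob[(t)] - pBob[src_pitch2 + (b)]);       \
	 *     }
	 *
	 * e.g. TRY_PAIR (4, -4) for the k,m pair below; the [1] variants use
	 * the offsets t+1 and b+1 to handle the chroma byte to the right. */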
+	// k,m
+	if (ABS (pBob[4] - pBob[src_pitch2 - 4]) < diff[0]) {
+	  best[0] = (pBob[4] + pBob[src_pitch2 - 4]) / 2;
+	  diff[0] = ABS (pBob[4] - pBob[src_pitch2 - 4]);
+	}
+
+	if (ABS (pBob[5] - pBob[src_pitch2 - 3]) < diff[1]) {
+	  best[1] = (pBob[5] + pBob[src_pitch2 - 3]) / 2;
+	  diff[1] = ABS (pBob[5] - pBob[src_pitch2 - 3]);
+	}
+
+// We will also calc here the max/min values to later limit comb
+// so the max excursion will not exceed the Max_Comb constant
+
+#ifdef SKIP_SEARCH
+	best[0] = CLAMP (best[0], MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0]));
+	best[1] = CLAMP (best[1], MIN (pBob[src_pitch2 + 1], pBob[1]), MAX (pBob[src_pitch2 + 1], pBob[1]));
+#else
+	mov[0] = MAX (ABS (pBob[0] - pBobP[0]), ABS (pBob[src_pitch2] - pBobP[src_pitch2]));
+	mov[1] = MAX (ABS (pBob[1] - pBobP[1]), ABS (pBob[src_pitch2 + 1] - pBobP[src_pitch2 + 1]));
+
+	MinVals[0] = 0;
+	MinVals[1] = 0;
+	MaxVals[0] = 255;
+	MaxVals[1] = 255;
+
+	if (mov[0] > Max_Mov[0]) {
+	  MinVals[0] = MAX (MIN (pBob[0], pBob[src_pitch2]), best[0]);
+	  MaxVals[0] = MIN (MAX (pBob[0], pBob[src_pitch2]), best[0]);
+	}
+
+	if (mov[1] > Max_Mov[1]) {
+	  MinVals[1] = MAX (MIN (pBob[1], pBob[src_pitch2 + 1]), best[1]);
+	  MaxVals[1] = MIN (MAX (pBob[1], pBob[src_pitch2 + 1]), best[1]);
+	}
+
+	best[0] = CLAMP (best[0], MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0]));
+	best[1] = CLAMP (best[1], MIN (pBob[src_pitch2 + 1], pBob[1]), MAX (pBob[src_pitch2 + 1], pBob[1]));
+#endif
+
+	avg[0] = (pBob[src_pitch2] + pBob[0]) / 2;
+	avg[1] = (pBob[src_pitch2 + 1] + pBob[1]) / 2;
+	diff2[0] = ABS (pBob[src_pitch2] - pBob[0]);
+	diff2[1] = ABS (pBob[src_pitch2 + 1] - pBob[1]);
+
+	if (diff2[0] < diff[0]) {
+	  best[0] = avg[0];
+	  diff[0] = diff2[0];
+	}
+
+	if (diff2[1] < diff[1]) {
+	  best[1] = avg[1];
+	  diff[1] = diff2[1];
+	}
+#endif
diff --git a/gst/deinterlace/tvtime/tomsmocomp/tomsmocompmacros.h b/gst/deinterlace/tvtime/tomsmocomp/tomsmocompmacros.h
new file mode 100644
index 00000000..7e8147ec
--- /dev/null
+++ b/gst/deinterlace/tvtime/tomsmocomp/tomsmocompmacros.h
@@ -0,0 +1,164 @@
+#include <string.h>
+#include <math.h>
+
+// Define a few macros for CPU dependent instructions.
+// I suspect I don't really understand how the C macro preprocessor works but
+// this seems to get the job done.
+// TRB 7/01
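// Why the two-level V_*2/V_*3 indirection below works: '##' pastes its
// arguments *before* expanding them, so a direct V_PAVGB_##SIMD_TYPE would
// paste the literal token SIMD_TYPE.  Routing the call through a second
// macro layer forces SIMD_TYPE to expand first.  Standalone sketch (names
// made up, not from the original sources):
//
//   #define SIMD_TYPE MMXEXT
//   #define OP_MMXEXT "pavgb"
//   #define OP(t)  OP2(t)      /* t expands to MMXEXT here...  */
//   #define OP2(t) OP_##t      /* ...and only then gets pasted */
//   /* OP(SIMD_TYPE) -> OP2(MMXEXT) -> OP_MMXEXT -> "pavgb" */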
+
+// BEFORE USING THESE YOU MUST SET:
+
+// #define SIMD_TYPE MMXEXT (or MMX or 3DNOW)
+
+// some macros for pavgb instruction
+//	V_PAVGB(mmr1, mmr2, mmr work register, smask)   mmr2 may = mmrw if you can trash it
+
+#define V_PAVGB_MMX(mmr1, mmr2, mmrw, smask) \
+	"movq "mmr2", "mmrw"\n\t" \
+	"pand "smask", "mmrw"\n\t" \
+	"psrlw $1, "mmrw"\n\t" \
+	"pand "smask", "mmr1"\n\t" \
+	"psrlw $1, "mmr1"\n\t" \
+	"paddusb "mmrw", "mmr1"\n\t"
+#define V_PAVGB_MMXEXT(mmr1, mmr2, mmrw, smask)	"pavgb "mmr2", "mmr1"\n\t"
+#define V_PAVGB_3DNOW(mmr1, mmr2, mmrw, smask)	"pavgusb "mmr2", "mmr1"\n\t"
+#define V_PAVGB(mmr1, mmr2, mmrw, smask)	V_PAVGB2(mmr1, mmr2, mmrw, smask, SIMD_TYPE)
+#define V_PAVGB2(mmr1, mmr2, mmrw, smask, simd_type) V_PAVGB3(mmr1, mmr2, mmrw, smask, simd_type)
+#define V_PAVGB3(mmr1, mmr2, mmrw, smask, simd_type) V_PAVGB_##simd_type(mmr1, mmr2, mmrw, smask)
+
+// some macros for pmaxub instruction
+#define V_PMAXUB_MMX(mmr1, mmr2) \
+	"psubusb "mmr2", "mmr1"\n\t" \
+	"paddusb "mmr2", "mmr1"\n\t"
+#define V_PMAXUB_MMXEXT(mmr1, mmr2)	"pmaxub "mmr2", "mmr1"\n\t"
+#define V_PMAXUB_3DNOW(mmr1, mmr2)	V_PMAXUB_MMX(mmr1, mmr2)  // use MMX version
+#define V_PMAXUB(mmr1, mmr2)		V_PMAXUB2(mmr1, mmr2, SIMD_TYPE)
+#define V_PMAXUB2(mmr1, mmr2, simd_type) V_PMAXUB3(mmr1, mmr2, simd_type)
+#define V_PMAXUB3(mmr1, mmr2, simd_type) V_PMAXUB_##simd_type(mmr1, mmr2)
+
+// some macros for pminub instruction
+//	V_PMINUB(mmr1, mmr2, mmr work register)   mmr2 may NOT = mmrw
+#define V_PMINUB_MMX(mmr1, mmr2, mmrw) \
+	"pcmpeqb "mmrw", "mmrw"\n\t" \
+	"psubusb "mmr2", "mmrw"\n\t" \
+	"paddusb "mmrw", "mmr1"\n\t" \
+	"psubusb "mmrw", "mmr1"\n\t"
+#define V_PMINUB_MMXEXT(mmr1, mmr2, mmrw)	"pminub "mmr2", "mmr1"\n\t"
+#define V_PMINUB_3DNOW(mmr1, mmr2, mmrw)	V_PMINUB_MMX(mmr1, mmr2, mmrw)  // use MMX version
+#define V_PMINUB(mmr1, mmr2, mmrw)		V_PMINUB2(mmr1, mmr2, mmrw, SIMD_TYPE)
+#define V_PMINUB2(mmr1, mmr2, mmrw, simd_type)	V_PMINUB3(mmr1, mmr2, mmrw, simd_type)
+#define V_PMINUB3(mmr1, mmr2, mmrw, simd_type)	V_PMINUB_##simd_type(mmr1, mmr2, mmrw)
+
+// some macros for movntq instruction
+//	V_MOVNTQ(mmr1, mmr2)
+#define V_MOVNTQ_MMX(mmr1, mmr2)	"movq "mmr2", "mmr1"\n\t"
+#define V_MOVNTQ_3DNOW(mmr1, mmr2)	"movq "mmr2", "mmr1"\n\t"
+#define V_MOVNTQ_MMXEXT(mmr1, mmr2)	"movntq "mmr2", "mmr1"\n\t"
+#define V_MOVNTQ(mmr1, mmr2)		V_MOVNTQ2(mmr1, mmr2, SIMD_TYPE)
+#define V_MOVNTQ2(mmr1, mmr2, simd_type) V_MOVNTQ3(mmr1, mmr2, simd_type)
+#define V_MOVNTQ3(mmr1, mmr2, simd_type) V_MOVNTQ_##simd_type(mmr1, mmr2)
+
+// end of macros
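/* The MMX fallbacks above lean on a property of saturating byte arithmetic:
 * unsigned max and min need no compare instruction.  Scalar sketches of what
 * V_PMAXUB_MMX and V_PMINUB_MMX compute per byte (illustrative only, not
 * part of the original header): */
static inline unsigned char
pmaxub_sketch (unsigned char a, unsigned char b)
{
  /* psubusb: (a - b) clamped at 0; paddusb b then yields MAX (a, b) */
  unsigned char d = (a > b) ? (unsigned char) (a - b) : 0;

  return (unsigned char) (d + b);       /* never overflows: d + b == MAX (a, b) */
}

static inline unsigned char
pminub_sketch (unsigned char a, unsigned char b)
{
  /* pcmpeqb/psubusb build w = 255 - b; then sat(a + w) - w == MIN (a, b) */
  unsigned char w = (unsigned char) (0xff - b);
  unsigned int t = (unsigned int) a + w;

  if (t > 0xff)
    t = 0xff;                           /* paddusb saturation */
  return (unsigned char) (t - w);       /* psubusb cannot underflow here */
}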
+ "por %%xmm0, %%xmm5\n\t" /* and merge new & old vals */ \ + "por %%xmm2, %%xmm7\n\t" + +#define MERGE4PIXavgH(PADDR1A, PADDR1B, PADDR2A, PADDR2B) \ + "movdqu "PADDR1A", %%xmm0\n\t" /* our 4 pixels */ \ + "movdqu "PADDR2A", %%xmm1\n\t" /* our pixel2 value */ \ + "movdqu "PADDR1B", %%xmm2\n\t" /* our 4 pixels */ \ + "movdqu "PADDR2B", %%xmm3\n\t" /* our pixel2 value */ \ + "pavgb %%xmm2, %%xmm0\n\t" \ + "pavgb %%xmm3, %%xmm1\n\t" \ + "movdqa %%xmm0, %%xmm2\n\t" /* another copy of our pixel1 value */ \ + "movdqa %%xmm1, %%xmm3\n\t" /* another copy of our pixel1 value */ \ + "psubusb %%xmm1, %%xmm2\n\t" \ + "psubusb %%xmm0, %%xmm3\n\t" \ + "por %%xmm3, %%xmm2\n\t" \ + "pavgb %%xmm1, %%xmm0\n\t" /* avg of 2 pixels */ \ + "movdqa %%xmm2, %%xmm3\n\t" /* another copy of our our weights */ \ + "pxor %%xmm1, %%xmm1\n\t" \ + "psubusb %%xmm7, %%xmm3\n\t" /* nonzero where old weights lower, else 0 */ \ + "pcmpeqb %%xmm1, %%xmm3\n\t" /* now ff where new better, else 00 */ \ + "pcmpeqb %%xmm3, %%xmm1\n\t" /* here ff where old better, else 00 */ \ + "pand %%xmm3, %%xmm0\n\t" /* keep only better new pixels */ \ + "pand %%xmm3, %%xmm2\n\t" /* and weights */ \ + "pand %%xmm1, %%xmm5\n\t" /* keep only better old pixels */ \ + "pand %%xmm1, %%xmm7\n\t" \ + "por %%xmm0, %%xmm5\n\t" /* and merge new & old vals */ \ + "por %%xmm2, %%xmm7\n\t" + +#define RESET_CHROMA "por "_UVMask", %%xmm7\n\t" + +#else // ifdef IS_SSE2 + +#define MERGE4PIXavg(PADDR1, PADDR2) \ + "movq "PADDR1", %%mm0\n\t" /* our 4 pixels */ \ + "movq "PADDR2", %%mm1\n\t" /* our pixel2 value */ \ + "movq %%mm0, %%mm2\n\t" /* another copy of our pixel1 value */ \ + "movq %%mm1, %%mm3\n\t" /* another copy of our pixel1 value */ \ + "psubusb %%mm1, %%mm2\n\t" \ + "psubusb %%mm0, %%mm3\n\t" \ + "por %%mm3, %%mm2\n\t" \ + V_PAVGB ("%%mm0", "%%mm1", "%%mm3", _ShiftMask) /* avg of 2 pixels */ \ + "movq %%mm2, %%mm3\n\t" /* another copy of our our weights */ \ + "pxor %%mm1, %%mm1\n\t" \ + "psubusb %%mm7, %%mm3\n\t" /* nonzero where old weights lower, else 0 */ \ + "pcmpeqb %%mm1, %%mm3\n\t" /* now ff where new better, else 00 */ \ + "pcmpeqb %%mm3, %%mm1\n\t" /* here ff where old better, else 00 */ \ + "pand %%mm3, %%mm0\n\t" /* keep only better new pixels */ \ + "pand %%mm3, %%mm2\n\t" /* and weights */ \ + "pand %%mm1, %%mm5\n\t" /* keep only better old pixels */ \ + "pand %%mm1, %%mm7\n\t" \ + "por %%mm0, %%mm5\n\t" /* and merge new & old vals */ \ + "por %%mm2, %%mm7\n\t" + +#define MERGE4PIXavgH(PADDR1A, PADDR1B, PADDR2A, PADDR2B) \ + "movq "PADDR1A", %%mm0\n\t" /* our 4 pixels */ \ + "movq "PADDR2A", %%mm1\n\t" /* our pixel2 value */ \ + "movq "PADDR1B", %%mm2\n\t" /* our 4 pixels */ \ + "movq "PADDR2B", %%mm3\n\t" /* our pixel2 value */ \ + V_PAVGB("%%mm0", "%%mm2", "%%mm2", _ShiftMask) \ + V_PAVGB("%%mm1", "%%mm3", "%%mm3", _ShiftMask) \ + "movq %%mm0, %%mm2\n\t" /* another copy of our pixel1 value */ \ + "movq %%mm1, %%mm3\n\t" /* another copy of our pixel1 value */ \ + "psubusb %%mm1, %%mm2\n\t" \ + "psubusb %%mm0, %%mm3\n\t" \ + "por %%mm3, %%mm2\n\t" \ + V_PAVGB("%%mm0", "%%mm1", "%%mm3", _ShiftMask) /* avg of 2 pixels */ \ + "movq %%mm2, %%mm3\n\t" /* another copy of our our weights */ \ + "pxor %%mm1, %%mm1\n\t" \ + "psubusb %%mm7, %%mm3\n\t" /* nonzero where old weights lower, else 0 */ \ + "pcmpeqb %%mm1, %%mm3\n\t" /* now ff where new better, else 00 */ \ + "pcmpeqb %%mm3, %%mm1\n\t" /* here ff where old better, else 00 */ \ + "pand %%mm3, %%mm0\n\t" /* keep only better new pixels */ \ + "pand %%mm3, %%mm2\n\t" /* and weights */ \ + 
"pand %%mm1, %%mm5\n\t" /* keep only better old pixels */ \ + "pand %%mm1, %%mm7\n\t" \ + "por %%mm0, %%mm5\n\t" /* and merge new & old vals */ \ + "por %%mm2, %%mm7\n\t" + +#define RESET_CHROMA "por "_UVMask", %%mm7\n\t" + +#endif + + diff --git a/gst/deinterlace/tvtime/vfir.c b/gst/deinterlace/tvtime/vfir.c new file mode 100644 index 00000000..b3ebaae1 --- /dev/null +++ b/gst/deinterlace/tvtime/vfir.c @@ -0,0 +1,187 @@ +/* + * + * GStreamer + * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net> + * Copyright (c) 2001, 2002, 2003 Fabrice Bellard. + * Copyright (C) 2008 Sebastian Dröge <slomo@collabora.co.uk> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * This file contains code from ffmpeg, see http://ffmpeg.org/ (LGPL) + * and modifications by Billy Biggs. + * + * Relicensed for GStreamer from GPL to LGPL with permit from Billy Biggs. + * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578 + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "_stdint.h" +#include "gstdeinterlace.h" +#include <string.h> + +#define GST_TYPE_DEINTERLACE_METHOD_VFIR (gst_deinterlace_method_vfir_get_type ()) +#define GST_IS_DEINTERLACE_METHOD_VFIR(obj) (G_TYPE_CHECK_INSTANCE_TYPE ((obj), GST_TYPE_DEINTERLACE_METHOD_VFIR)) +#define GST_IS_DEINTERLACE_METHOD_VFIR_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), GST_TYPE_DEINTERLACE_METHOD_VFIR)) +#define GST_DEINTERLACE_METHOD_VFIR_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS ((obj), GST_TYPE_DEINTERLACE_METHOD_VFIR, GstDeinterlaceMethodVFIRClass)) +#define GST_DEINTERLACE_METHOD_VFIR(obj) (G_TYPE_CHECK_INSTANCE_CAST ((obj), GST_TYPE_DEINTERLACE_METHOD_VFIR, GstDeinterlaceMethodVFIR)) +#define GST_DEINTERLACE_METHOD_VFIR_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST ((klass), GST_TYPE_DEINTERLACE_METHOD_VFIR, GstDeinterlaceMethodVFIRClass)) +#define GST_DEINTERLACE_METHOD_VFIR_CAST(obj) ((GstDeinterlaceMethodVFIR*)(obj)) + +GType gst_deinterlace_method_vfir_get_type (void); + +typedef GstDeinterlaceSimpleMethod GstDeinterlaceMethodVFIR; + +typedef GstDeinterlaceSimpleMethodClass GstDeinterlaceMethodVFIRClass; + +/* + * The MPEG2 spec uses a slightly harsher filter, they specify + * [-1 8 2 8 -1]. ffmpeg uses a similar filter but with more of + * a tendancy to blur than to use the local information. The + * filter taps here are: [-1 4 2 4 -1]. + */ + +/** + * C implementation. 
+/**
+ * C implementation.
+ */
+static inline void
+deinterlace_line_c (GstDeinterlaceMethod * self, GstDeinterlace * parent,
+    guint8 * dst, GstDeinterlaceScanlineData * scanlines, gint width)
+{
+  gint sum;
+  guint8 *lum_m4 = scanlines->tt1;
+  guint8 *lum_m3 = scanlines->t0;
+  guint8 *lum_m2 = scanlines->m1;
+  guint8 *lum_m1 = scanlines->b0;
+  guint8 *lum = scanlines->bb1;
+  gint size = width * 2;
+
+  for (; size >= 0; size--) {
+    sum = -lum_m4[0];
+    sum += lum_m3[0] << 2;
+    sum += lum_m2[0] << 1;
+    sum += lum_m1[0] << 2;
+    sum += -lum[0];
+    dst[0] = CLAMP ((sum + 4) >> 3, 0, 255);    /* clip to [0, 255], as ffmpeg's cm[] table did */
+    lum_m4++;
+    lum_m3++;
+    lum_m2++;
+    lum_m1++;
+    lum++;
+    dst++;
+  }
+}
+
+#ifdef BUILD_X86_ASM
+#include "mmx.h"
+static void
+deinterlace_line_mmx (GstDeinterlaceMethod * self, GstDeinterlace * parent,
+    guint8 * dst, GstDeinterlaceScanlineData * scanlines, gint width)
+{
+  mmx_t rounder;
+  guint8 *lum_m4 = scanlines->tt1;
+  guint8 *lum_m3 = scanlines->t0;
+  guint8 *lum_m2 = scanlines->m1;
+  guint8 *lum_m1 = scanlines->b0;
+  guint8 *lum = scanlines->bb1;
+
+  rounder.uw[0] = 4;
+  rounder.uw[1] = 4;
+  rounder.uw[2] = 4;
+  rounder.uw[3] = 4;
+  pxor_r2r (mm7, mm7);
+  movq_m2r (rounder, mm6);
+
+  for (; width > 1; width -= 2) {
+    movd_m2r (*lum_m4, mm0);
+    movd_m2r (*lum_m3, mm1);
+    movd_m2r (*lum_m2, mm2);
+    movd_m2r (*lum_m1, mm3);
+    movd_m2r (*lum, mm4);
+    punpcklbw_r2r (mm7, mm0);
+    punpcklbw_r2r (mm7, mm1);
+    punpcklbw_r2r (mm7, mm2);
+    punpcklbw_r2r (mm7, mm3);
+    punpcklbw_r2r (mm7, mm4);
+    paddw_r2r (mm3, mm1);
+    psllw_i2r (1, mm2);
+    paddw_r2r (mm4, mm0);
+    psllw_i2r (2, mm1);         // (lum_m3 + lum_m1) * 4
+    paddw_r2r (mm6, mm2);
+    paddw_r2r (mm2, mm1);
+    psubusw_r2r (mm0, mm1);
+    psrlw_i2r (3, mm1);         // / 8
+    packuswb_r2r (mm7, mm1);
+    movd_r2m (mm1, *dst);
+    lum_m4 += 4;
+    lum_m3 += 4;
+    lum_m2 += 4;
+    lum_m1 += 4;
+    lum += 4;
+    dst += 4;
+  }
+  emms ();
+
+  /* Handle odd widths */
+  if (width > 0) {
+    scanlines->tt1 = lum_m4;
+    scanlines->t0 = lum_m3;
+    scanlines->m1 = lum_m2;
+    scanlines->b0 = lum_m1;
+    scanlines->bb1 = lum;
+
+    deinterlace_line_c (self, parent, dst, scanlines, width);
+  }
+}
+#endif
+
+G_DEFINE_TYPE (GstDeinterlaceMethodVFIR, gst_deinterlace_method_vfir,
+    GST_TYPE_DEINTERLACE_SIMPLE_METHOD);
+
+static void
+gst_deinterlace_method_vfir_class_init (GstDeinterlaceMethodVFIRClass * klass)
+{
+  GstDeinterlaceMethodClass *dim_class = (GstDeinterlaceMethodClass *) klass;
+  GstDeinterlaceSimpleMethodClass *dism_class =
+      (GstDeinterlaceSimpleMethodClass *) klass;
+#ifdef BUILD_X86_ASM
+  guint cpu_flags = oil_cpu_get_flags ();
+#endif
+
+  dim_class->fields_required = 2;
+  dim_class->name = "Blur Vertical";
+  dim_class->nick = "vfir";
+  dim_class->latency = 0;
+
+#ifdef BUILD_X86_ASM
+  if (cpu_flags & OIL_IMPL_FLAG_MMX) {
+    dism_class->interpolate_scanline = deinterlace_line_mmx;
+  } else {
+    dism_class->interpolate_scanline = deinterlace_line_c;
+  }
+#else
+  dism_class->interpolate_scanline = deinterlace_line_c;
+#endif
+}
+
+static void
+gst_deinterlace_method_vfir_init (GstDeinterlaceMethodVFIR * self)
+{
+}
diff --git a/gst/deinterlace/tvtime/weave.c b/gst/deinterlace/tvtime/weave.c
new file mode 100644
index 00000000..1a86170e
--- /dev/null
+++ b/gst/deinterlace/tvtime/weave.c
@@ -0,0 +1,82 @@
+/**
+ * Weave frames
+ * Copyright (C) 2002 Billy Biggs <vektor@dumbterm.net>.
+ * Copyright (C) 2008 Sebastian Dröge <sebastian.droege@collabora.co.uk>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "_stdint.h"
+#include "gstdeinterlace.h"
+#include <string.h>
+
+#define GST_TYPE_DEINTERLACE_METHOD_WEAVE	(gst_deinterlace_method_weave_get_type ())
+#define GST_IS_DEINTERLACE_METHOD_WEAVE(obj)	(G_TYPE_CHECK_INSTANCE_TYPE ((obj), GST_TYPE_DEINTERLACE_METHOD_WEAVE))
+#define GST_IS_DEINTERLACE_METHOD_WEAVE_CLASS(klass)	(G_TYPE_CHECK_CLASS_TYPE ((klass), GST_TYPE_DEINTERLACE_METHOD_WEAVE))
+#define GST_DEINTERLACE_METHOD_WEAVE_GET_CLASS(obj)	(G_TYPE_INSTANCE_GET_CLASS ((obj), GST_TYPE_DEINTERLACE_METHOD_WEAVE, GstDeinterlaceMethodWeaveClass))
+#define GST_DEINTERLACE_METHOD_WEAVE(obj)	(G_TYPE_CHECK_INSTANCE_CAST ((obj), GST_TYPE_DEINTERLACE_METHOD_WEAVE, GstDeinterlaceMethodWeave))
+#define GST_DEINTERLACE_METHOD_WEAVE_CLASS(klass)	(G_TYPE_CHECK_CLASS_CAST ((klass), GST_TYPE_DEINTERLACE_METHOD_WEAVE, GstDeinterlaceMethodWeaveClass))
+#define GST_DEINTERLACE_METHOD_WEAVE_CAST(obj)	((GstDeinterlaceMethodWeave*)(obj))
+
+GType gst_deinterlace_method_weave_get_type (void);
+
+typedef GstDeinterlaceSimpleMethod GstDeinterlaceMethodWeave;
+
+typedef GstDeinterlaceSimpleMethodClass GstDeinterlaceMethodWeaveClass;
+
+
+static void
+deinterlace_scanline_weave (GstDeinterlaceMethod * self,
+    GstDeinterlace * parent, guint8 * out,
+    GstDeinterlaceScanlineData * scanlines, gint width)
+{
+  oil_memcpy (out, scanlines->m1, parent->row_stride);
+}
+
+static void
+copy_scanline (GstDeinterlaceMethod * self, GstDeinterlace * parent,
+    guint8 * out, GstDeinterlaceScanlineData * scanlines, gint width)
+{
+  oil_memcpy (out, scanlines->m0, parent->row_stride);
+}
+
+G_DEFINE_TYPE (GstDeinterlaceMethodWeave, gst_deinterlace_method_weave,
+    GST_TYPE_DEINTERLACE_SIMPLE_METHOD);
+
+static void
+gst_deinterlace_method_weave_class_init (GstDeinterlaceMethodWeaveClass * klass)
+{
+  GstDeinterlaceMethodClass *dim_class = (GstDeinterlaceMethodClass *) klass;
+  GstDeinterlaceSimpleMethodClass *dism_class =
+      (GstDeinterlaceSimpleMethodClass *) klass;
+
+  dim_class->fields_required = 2;
+  dim_class->name = "Weave";
+  dim_class->nick = "weave";
+  dim_class->latency = 0;
+
+  dism_class->interpolate_scanline = deinterlace_scanline_weave;
+  dism_class->copy_scanline = copy_scanline;
+}
+
+static void
+gst_deinterlace_method_weave_init (GstDeinterlaceMethodWeave * self)
+{
+}
diff --git a/gst/deinterlace/tvtime/weavebff.c b/gst/deinterlace/tvtime/weavebff.c
new file mode 100644
index 00000000..eb983cf2
--- /dev/null
+++ b/gst/deinterlace/tvtime/weavebff.c
@@ -0,0 +1,88 @@
+/**
+ * Weave frames, bottom-field-first.
+ * Copyright (C) 2003 Billy Biggs <vektor@dumbterm.net>.
+ * Copyright (C) 2008 Sebastian Dröge <sebastian.droege@collabora.co.uk>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "_stdint.h"
+#include "gstdeinterlace.h"
+#include <string.h>
+
+#define GST_TYPE_DEINTERLACE_METHOD_WEAVE_BFF	(gst_deinterlace_method_weave_bff_get_type ())
+#define GST_IS_DEINTERLACE_METHOD_WEAVE_BFF(obj)	(G_TYPE_CHECK_INSTANCE_TYPE ((obj), GST_TYPE_DEINTERLACE_METHOD_WEAVE_BFF))
+#define GST_IS_DEINTERLACE_METHOD_WEAVE_BFF_CLASS(klass)	(G_TYPE_CHECK_CLASS_TYPE ((klass), GST_TYPE_DEINTERLACE_METHOD_WEAVE_BFF))
+#define GST_DEINTERLACE_METHOD_WEAVE_BFF_GET_CLASS(obj)	(G_TYPE_INSTANCE_GET_CLASS ((obj), GST_TYPE_DEINTERLACE_METHOD_WEAVE_BFF, GstDeinterlaceMethodWeaveBFFClass))
+#define GST_DEINTERLACE_METHOD_WEAVE_BFF(obj)	(G_TYPE_CHECK_INSTANCE_CAST ((obj), GST_TYPE_DEINTERLACE_METHOD_WEAVE_BFF, GstDeinterlaceMethodWeaveBFF))
+#define GST_DEINTERLACE_METHOD_WEAVE_BFF_CLASS(klass)	(G_TYPE_CHECK_CLASS_CAST ((klass), GST_TYPE_DEINTERLACE_METHOD_WEAVE_BFF, GstDeinterlaceMethodWeaveBFFClass))
+#define GST_DEINTERLACE_METHOD_WEAVE_BFF_CAST(obj)	((GstDeinterlaceMethodWeaveBFF*)(obj))
+
+GType gst_deinterlace_method_weave_bff_get_type (void);
+
+typedef GstDeinterlaceSimpleMethod GstDeinterlaceMethodWeaveBFF;
+
+typedef GstDeinterlaceSimpleMethodClass GstDeinterlaceMethodWeaveBFFClass;
+
+
+static void
+deinterlace_scanline_weave (GstDeinterlaceMethod * self,
+    GstDeinterlace * parent, guint8 * out,
+    GstDeinterlaceScanlineData * scanlines, gint width)
+{
+  oil_memcpy (out, scanlines->m1, parent->row_stride);
+}
+
+static void
+copy_scanline (GstDeinterlaceMethod * self, GstDeinterlace * parent,
+    guint8 * out, GstDeinterlaceScanlineData * scanlines, gint width)
+{
+  /* FIXME: original code used m2 and m0 but this looks really bad */
+  if (scanlines->bottom_field) {
+    oil_memcpy (out, scanlines->bb2, parent->row_stride);
+  } else {
+    oil_memcpy (out, scanlines->bb0, parent->row_stride);
+  }
+}
+
+G_DEFINE_TYPE (GstDeinterlaceMethodWeaveBFF, gst_deinterlace_method_weave_bff,
+    GST_TYPE_DEINTERLACE_SIMPLE_METHOD);
+
+static void
+gst_deinterlace_method_weave_bff_class_init (GstDeinterlaceMethodWeaveBFFClass *
+    klass)
+{
+  GstDeinterlaceMethodClass *dim_class = (GstDeinterlaceMethodClass *) klass;
+  GstDeinterlaceSimpleMethodClass *dism_class =
+      (GstDeinterlaceSimpleMethodClass *) klass;
+
+  dim_class->fields_required = 3;
+  dim_class->name = "Progressive: Bottom Field First";
+  dim_class->nick = "weavebff";
+  dim_class->latency = 0;
+
+  dism_class->interpolate_scanline = deinterlace_scanline_weave;
+  dism_class->copy_scanline = copy_scanline;
+}
+
+static void
+gst_deinterlace_method_weave_bff_init (GstDeinterlaceMethodWeaveBFF * self)
+{
+}
diff --git a/gst/deinterlace/tvtime/weavetff.c b/gst/deinterlace/tvtime/weavetff.c
new file mode 100644
index 00000000..4885b63b
--- /dev/null
+++ b/gst/deinterlace/tvtime/weavetff.c
@@ -0,0 +1,88 @@
+/**
+ * Weave frames, top-field-first.
+ * Copyright (C) 2003 Billy Biggs <vektor@dumbterm.net>.
+ * Copyright (C) 2008 Sebastian Dröge <sebastian.droege@collabora.co.uk>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "_stdint.h"
+#include "gstdeinterlace.h"
+#include <string.h>
+
+#define GST_TYPE_DEINTERLACE_METHOD_WEAVE_TFF	(gst_deinterlace_method_weave_tff_get_type ())
+#define GST_IS_DEINTERLACE_METHOD_WEAVE_TFF(obj)	(G_TYPE_CHECK_INSTANCE_TYPE ((obj), GST_TYPE_DEINTERLACE_METHOD_WEAVE_TFF))
+#define GST_IS_DEINTERLACE_METHOD_WEAVE_TFF_CLASS(klass)	(G_TYPE_CHECK_CLASS_TYPE ((klass), GST_TYPE_DEINTERLACE_METHOD_WEAVE_TFF))
+#define GST_DEINTERLACE_METHOD_WEAVE_TFF_GET_CLASS(obj)	(G_TYPE_INSTANCE_GET_CLASS ((obj), GST_TYPE_DEINTERLACE_METHOD_WEAVE_TFF, GstDeinterlaceMethodWeaveTFFClass))
+#define GST_DEINTERLACE_METHOD_WEAVE_TFF(obj)	(G_TYPE_CHECK_INSTANCE_CAST ((obj), GST_TYPE_DEINTERLACE_METHOD_WEAVE_TFF, GstDeinterlaceMethodWeaveTFF))
+#define GST_DEINTERLACE_METHOD_WEAVE_TFF_CLASS(klass)	(G_TYPE_CHECK_CLASS_CAST ((klass), GST_TYPE_DEINTERLACE_METHOD_WEAVE_TFF, GstDeinterlaceMethodWeaveTFFClass))
+#define GST_DEINTERLACE_METHOD_WEAVE_TFF_CAST(obj)	((GstDeinterlaceMethodWeaveTFF*)(obj))
+
+GType gst_deinterlace_method_weave_tff_get_type (void);
+
+typedef GstDeinterlaceSimpleMethod GstDeinterlaceMethodWeaveTFF;
+
+typedef GstDeinterlaceSimpleMethodClass GstDeinterlaceMethodWeaveTFFClass;
+
+
+static void
+deinterlace_scanline_weave (GstDeinterlaceMethod * self,
+    GstDeinterlace * parent, guint8 * out,
+    GstDeinterlaceScanlineData * scanlines, gint width)
+{
+  oil_memcpy (out, scanlines->m1, parent->row_stride);
+}
+
+static void
+copy_scanline (GstDeinterlaceMethod * self, GstDeinterlace * parent,
+    guint8 * out, GstDeinterlaceScanlineData * scanlines, gint width)
+{
+  /* FIXME: original code used m2 and m0 but this looks really bad */
+  if (scanlines->bottom_field) {
+    oil_memcpy (out, scanlines->bb0, parent->row_stride);
+  } else {
+    oil_memcpy (out, scanlines->bb2, parent->row_stride);
+  }
+}
+
+G_DEFINE_TYPE (GstDeinterlaceMethodWeaveTFF, gst_deinterlace_method_weave_tff,
+    GST_TYPE_DEINTERLACE_SIMPLE_METHOD);
+
+static void
+gst_deinterlace_method_weave_tff_class_init (GstDeinterlaceMethodWeaveTFFClass *
+    klass)
+{
+  GstDeinterlaceMethodClass *dim_class = (GstDeinterlaceMethodClass *) klass;
+  GstDeinterlaceSimpleMethodClass *dism_class =
+      (GstDeinterlaceSimpleMethodClass *) klass;
+
+  dim_class->fields_required = 3;
+  dim_class->name = "Progressive: Top Field First";
+  dim_class->nick = "weavetff";
+  dim_class->latency = 0;
+
+  dism_class->interpolate_scanline = deinterlace_scanline_weave;
+  dism_class->copy_scanline = copy_scanline;
+}
+
+static void
+gst_deinterlace_method_weave_tff_init (GstDeinterlaceMethodWeaveTFF * self)
+{
+}
diff --git a/gst/deinterlace/tvtime/x86-64_macros.inc b/gst/deinterlace/tvtime/x86-64_macros.inc
new file mode 100644
index 00000000..2e9df758
--- /dev/null
+++ b/gst/deinterlace/tvtime/x86-64_macros.inc
@@ -0,0 +1,82 @@
+/*
+ *
+ * GStreamer
+ * Copyright (C) 2004 Dirk Ziegelmeier <dziegel@gmx.de>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ *
+ * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578
+ */
+
+/*
+ * This file is copied from TVTIME's sources.
+ * Original author: Achim Schneider <batchall@mordor.ch>
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef XAX
+
+#if defined (HAVE_CPU_I386) && !defined(HAVE_CPU_X86_64)
+
+#define XAX "eax"
+#define XBX "ebx"
+#define XCX "ecx"
+#define XDX "edx"
+#define XSI "esi"
+#define XDI "edi"
+#define XSP "esp"
+#define MOVX "movl"
+#define LEAX "leal"
+#define DECX "decl"
+#define PUSHX "pushl"
+#define POPX "popl"
+#define CMPX "cmpl"
+#define ADDX "addl"
+#define SHLX "shll"
+#define SHRX "shrl"
+#define SUBX "subl"
+
+#elif defined (HAVE_CPU_X86_64)
+
+#define XAX "rax"
+#define XBX "rbx"
+#define XCX "rcx"
+#define XDX "rdx"
+#define XSI "rsi"
+#define XDI "rdi"
+#define XSP "rsp"
+#define MOVX "movq"
+#define LEAX "leaq"
+#define DECX "decq"
+#define PUSHX "pushq"
+#define POPX "popq"
+#define CMPX "cmpq"
+#define ADDX "addq"
+#define SHLX "shlq"
+#define SHRX "shrq"
+#define SUBX "subq"
+
+#else
+#error Undefined architecture. Define either HAVE_CPU_I386 or HAVE_CPU_X86_64.
+#endif
+
+#endif
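/* Usage sketch (not part of the file above): because the macros expand to
 * the register names and mnemonic suffixes that match the build, one
 * inline-asm body can serve both i386 and x86-64.  Hypothetical example,
 * copying 8-byte blocks with MMX (the mm0 clobber is omitted for brevity;
 * real code should account for it and for MMX availability): */
static inline void
copy_quads_sketch (unsigned char *dst, const unsigned char *src, long quads)
{
  __asm__ __volatile__ (
      "1:\n\t"
      "movq (%%" XSI "), %%mm0\n\t"     /* load 8 bytes */
      "movq %%mm0, (%%" XDI ")\n\t"     /* store 8 bytes */
      ADDX " $8, %%" XSI "\n\t"         /* advance source */
      ADDX " $8, %%" XDI "\n\t"         /* advance destination */
      DECX " %%" XCX "\n\t"             /* one block done */
      "jnz 1b\n\t"
      "emms\n\t"
      : "+S" (src), "+D" (dst), "+c" (quads)
      :
      : "memory");
}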