[OBSOLETE] SIMD IRRLICHT VECTORS!

Post those lines of code you feel like sharing or find what you require for your project here; or simply use them as tutorials.
kklouzal
Posts: 343
Joined: Sun Mar 28, 2010 8:14 pm
Location: USA - Arizona

Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO

Post by kklouzal »

I read the entire article; it's all very interesting. However, note the section that defines the compiler flags:

Code: Select all

GCC command line:
gcc -O2 -msse test.c -S -o test.asm
 
MSVC command line:
cl  /O2 /arch:SSE /c /FA test.c
 
ICC’s command line:
icc -O2 -msse test.c -S -o test.asm
The author did not take advantage of every available optimization flag; therefore, while GCC may be a superior compiler, it is not as far superior as the article would lead you to believe. Also, /GS (buffer security checks) is enabled by default in MSVC and was not turned off with /GS-, which would most definitely increase the complexity of the underlying ASM in exchange for increased security.

Take that article with a grain of salt. In my opinion the author did a great job explaining the ASM that was generated, but did not explore all the possible compiler flags needed to make the tests between the different compilers truly fair and equal. Too much is assumed on the basis of the single (/O2) optimization flag.
Dream Big Or Go Home.
Help Me Help You.
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO

Post by devsh »

to be fair if you're writing SSE and your code needs it that badly... you may want to use the O3 flag first XD

P.S. Found a bug in the vector class which caused it to not work at all! AND I FOUND A WAY TO use new and delete on SIMD classes requiring 16-byte alignment.
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO

Post by devsh »

I fixed up irrMath.h which was using some nasty inline assembly

Code: Select all

// Copyright (C) 2002-2012 Nikolaus Gebhardt
// This file is part of the "Irrlicht Engine".
// For conditions of distribution and use, see copyright notice in irrlicht.h
 
#ifndef __IRR_MATH_H_INCLUDED__
#define __IRR_MATH_H_INCLUDED__
 
#include "IrrCompileConfig.h"
#include "irrTypes.h"
#include <math.h>
#include <float.h>
#include <stdlib.h> // for abs() etc.
#include <limits.h> // For INT_MAX / UINT_MAX
 
// On the platforms/compilers tested below, the single-precision C math
// functions are provided as macros: each one calls the double-precision
// function on the argument(s) cast to irr::f64 and casts the result to irr::f32.
#if defined(_IRR_SOLARIS_PLATFORM_) || defined(__BORLANDC__) || defined (__BCPLUSPLUS__) || defined (_WIN32_WCE)
    #define sqrtf(X) (irr::f32)sqrt((irr::f64)(X))
    #define sinf(X) (irr::f32)sin((irr::f64)(X))
    #define cosf(X) (irr::f32)cos((irr::f64)(X))
    #define asinf(X) (irr::f32)asin((irr::f64)(X))
    #define acosf(X) (irr::f32)acos((irr::f64)(X))
    #define atan2f(X,Y) (irr::f32)atan2((irr::f64)(X),(irr::f64)(Y))
    #define ceilf(X) (irr::f32)ceil((irr::f64)(X))
    #define floorf(X) (irr::f32)floor((irr::f64)(X))
    #define powf(X,Y) (irr::f32)pow((irr::f64)(X),(irr::f64)(Y))
    #define fmodf(X,Y) (irr::f32)fmod((irr::f64)(X),(irr::f64)(Y))
    #define fabsf(X) (irr::f32)fabs((irr::f64)(X))
    #define logf(X) (irr::f32)log((irr::f64)(X))
#endif
 
// Fallback definition, used only if FLT_MAX is not already defined at this point.
#ifndef FLT_MAX
#define FLT_MAX 3.402823466E+38F
#endif
 
// Fallback definition, used only if FLT_MIN is not already defined at this point.
#ifndef FLT_MIN
#define FLT_MIN 1.17549435e-38F
#endif
 
namespace irr
{
namespace core
{
 
    //! Rounding error constants, used as the default tolerances of the equals()/iszero() helpers below.
    // The integer variants are 0, i.e. integer comparisons are exact by default.
 
    const s32 ROUNDING_ERROR_S32 = 0;
#ifdef __IRR_HAS_S64
    const s64 ROUNDING_ERROR_S64 = 0;
#endif
    const f32 ROUNDING_ERROR_f32 = 0.000001f;
    const f64 ROUNDING_ERROR_f64 = 0.00000001;
 
#ifdef PI // make sure we don't collide with a define
#undef PI
#endif
    //! Constant for PI.
    const f32 PI        = 3.14159265359f;
 
    //! Constant for reciprocal of PI.
    const f32 RECIPROCAL_PI = 1.0f/PI;
 
    //! Constant for half of PI.
    const f32 HALF_PI   = PI/2.0f;
 
#ifdef PI64 // make sure we don't collide with a define
#undef PI64
#endif
    //! Constant for 64bit PI.
    const f64 PI64      = 3.1415926535897932384626433832795028841971693993751;
 
    //! Constant for 64bit reciprocal of PI.
    const f64 RECIPROCAL_PI64 = 1.0/PI64;
 
    //! 32bit Constant for converting from degrees to radians
    const f32 DEGTORAD = PI / 180.0f;
 
    //! 32bit constant for converting from radians to degrees (formerly known as GRAD_PI)
    const f32 RADTODEG   = 180.0f / PI;
 
    //! 64bit constant for converting from degrees to radians (formerly known as GRAD_PI2)
    const f64 DEGTORAD64 = PI64 / 180.0;
 
    //! 64bit constant for converting from radians to degrees
    const f64 RADTODEG64 = 180.0 / PI64;
 
    //! Utility function to convert a radian value to degrees
    /** Provided as it can be clearer to write radToDeg(X) than RADTODEG * X
    \param radians  The radians value to convert to degrees.
    */
    inline f32 radToDeg(f32 radians)
    {
        return RADTODEG * radians;
    }
 
    //! Utility function to convert a radian value to degrees
    /** Provided as it can be clearer to write radToDeg(X) than RADTODEG * X
    \param radians  The radians value to convert to degrees.
    */
    inline f64 radToDeg(f64 radians)
    {
        return RADTODEG64 * radians;
    }
 
    //! Utility function to convert a degrees value to radians
    /** Provided as it can be clearer to write degToRad(X) than DEGTORAD * X
    \param degrees  The degrees value to convert to radians.
    */
    inline f32 degToRad(f32 degrees)
    {
        return DEGTORAD * degrees;
    }
 
    //! Utility function to convert a degrees value to radians
    /** Provided as it can be clearer to write degToRad(X) than DEGTORAD * X
    \param degrees  The degrees value to convert to radians.
    */
    inline f64 degToRad(f64 degrees)
    {
        return DEGTORAD64 * degrees;
    }
 
    //! returns minimum of two values. Own implementation to get rid of the STL (VS6 problems)
    template<class T>
    inline const T& min_(const T& a, const T& b)
    {
        return a < b ? a : b;
    }
 
    //! returns minimum of three values. Own implementation to get rid of the STL (VS6 problems)
    template<class T>
    inline const T& min_(const T& a, const T& b, const T& c)
    {
        return a < b ? min_(a, c) : min_(b, c);
    }
 
    //! returns maximum of two values. Own implementation to get rid of the STL (VS6 problems)
    template<class T>
    inline const T& max_(const T& a, const T& b)
    {
        return a < b ? b : a;
    }
 
    //! returns maximum of three values. Own implementation to get rid of the STL (VS6 problems)
    template<class T>
    inline const T& max_(const T& a, const T& b, const T& c)
    {
        return a < b ? max_(b, c) : max_(a, c);
    }
 
    //! returns abs of two values. Own implementation to get rid of STL (VS6 problems)
    template<class T>
    inline T abs_(const T& a)
    {
        return a < (T)0 ? -a : a;
    }
 
    //! returns linear interpolation of a and b with ratio t
    //! \return: a if t==0, b if t==1, and the linear interpolation else
    template<class T>
    inline T lerp(const T& a, const T& b, const f32 t)
    {
        return (T)(a*(1.f-t)) + (b*t);
    }
 
    //! clamps a value between low and high
    template <class T>
    inline const T clamp (const T& value, const T& low, const T& high)
    {
        return min_ (max_(value,low), high);
    }
 
    //! swaps the content of the passed parameters
    // Note: We use the same trick as boost and use two template arguments to
    // avoid ambiguity when swapping objects of an Irrlicht type that has not
    // it's own swap overload. Otherwise we get conflicts with some compilers
    // in combination with stl.
    template <class T1, class T2>
    inline void swap(T1& a, T2& b)
    {
        T1 c(a);
        a = b;
        b = c;
    }
 
    //! returns if a equals b, taking possible rounding errors into account
    inline bool equals(const f64 a, const f64 b, const f64 tolerance = ROUNDING_ERROR_f64)
    {
        return (a + tolerance >= b) && (a - tolerance <= b);
    }
 
    //! returns if a equals b, taking possible rounding errors into account
    inline bool equals(const f32 a, const f32 b, const f32 tolerance = ROUNDING_ERROR_f32)
    {
        return (a + tolerance >= b) && (a - tolerance <= b);
    }
 
    union FloatIntUnion32
    {
        FloatIntUnion32(float f1 = 0.0f) : f(f1) {}
        // Portable sign-extraction
        bool sign() const { return (i >> 31) != 0; }
 
        irr::s32 i;
        irr::f32 f;
    };
 
    //! We compare the difference in ULP's (spacing between floating-point numbers, aka ULP=1 means there exists no float between).
    //\result true when numbers have a ULP <= maxUlpDiff AND have the same sign.
    inline bool equalsByUlp(f32 a, f32 b, int maxUlpDiff)
    {
        // Based on the ideas and code from Bruce Dawson on
        // http://www.altdevblogaday.com/2012/02/22/comparing-floating-point-numbers-2012-edition/
        // When floats are interpreted as integers the two nearest possible float numbers differ just
        // by one integer number. Also works the other way round, an integer of 1 interpreted as float
        // is for example the smallest possible float number.
 
        FloatIntUnion32 fa(a);
        FloatIntUnion32 fb(b);
 
        // Different signs, we could maybe get difference to 0, but so close to 0 using epsilons is better.
        if ( fa.sign() != fb.sign() )
        {
            // Check for equality to make sure +0==-0
            if (fa.i == fb.i)
                return true;
            return false;
        }
 
        // Find the difference in ULPs.
        int ulpsDiff = abs_(fa.i- fb.i);
        if (ulpsDiff <= maxUlpDiff)
            return true;
 
        return false;
    }
 
#if 0
    //! returns if a equals b, not using any rounding tolerance
    inline bool equals(const s32 a, const s32 b)
    {
        return (a == b);
    }
 
    //! returns if a equals b, not using any rounding tolerance
    inline bool equals(const u32 a, const u32 b)
    {
        return (a == b);
    }
#endif
    //! returns if a equals b, taking an explicit rounding tolerance into account
    inline bool equals(const s32 a, const s32 b, const s32 tolerance = ROUNDING_ERROR_S32)
    {
        return (a + tolerance >= b) && (a - tolerance <= b);
    }
 
    //! returns if a equals b, taking an explicit rounding tolerance into account
    inline bool equals(const u32 a, const u32 b, const s32 tolerance = ROUNDING_ERROR_S32)
    {
        return (a + tolerance >= b) && (a - tolerance <= b);
    }
 
#ifdef __IRR_HAS_S64
    //! Tolerant equality for s64: true when b lies within [a - tolerance, a + tolerance].
    inline bool equals(const s64 a, const s64 b, const s64 tolerance = ROUNDING_ERROR_S64)
    {
        const bool notBelow = (a + tolerance >= b);
        const bool notAbove = (a - tolerance <= b);
        return notBelow && notAbove;
    }
#endif
 
    //! returns if a equals zero, taking rounding errors into account
    inline bool iszero(const f64 a, const f64 tolerance = ROUNDING_ERROR_f64)
    {
        return fabs(a) <= tolerance;
    }
 
    //! returns if a equals zero, taking rounding errors into account
    inline bool iszero(const f32 a, const f32 tolerance = ROUNDING_ERROR_f32)
    {
        return fabsf(a) <= tolerance;
    }
 
    //! returns if a equals not zero, taking rounding errors into account
    inline bool isnotzero(const f32 a, const f32 tolerance = ROUNDING_ERROR_f32)
    {
        return fabsf(a) > tolerance;
    }
 
    //! returns if a equals zero, taking rounding errors into account
    inline bool iszero(const s32 a, const s32 tolerance = 0)
    {
        return ( a & 0x7ffffff ) <= tolerance;
    }
 
    //! returns if a equals zero, taking rounding errors into account
    inline bool iszero(const u32 a, const u32 tolerance = 0)
    {
        return a <= tolerance;
    }
 
#ifdef __IRR_HAS_S64
    //! True when the absolute value of a (via abs_) does not exceed tolerance (s64 overload).
    inline bool iszero(const s64 a, const s64 tolerance = 0)
    {
        const s64 magnitude = abs_(a);
        return magnitude <= tolerance;
    }
#endif
 
    inline s32 s32_min(s32 a, s32 b)
    {
        const s32 mask = (a - b) >> 31;
        return (a & mask) | (b & ~mask);
    }
 
    inline s32 s32_max(s32 a, s32 b)
    {
        const s32 mask = (a - b) >> 31;
        return (b & mask) | (a & ~mask);
    }
 
    inline s32 s32_clamp (s32 value, s32 low, s32 high)
    {
        return s32_min(s32_max(value,low), high);
    }
 
    /*
        float IEEE-754 bit representation
 
        0      0x00000000
        1.0    0x3f800000
        0.5    0x3f000000
        3      0x40400000
        +inf   0x7f800000
        -inf   0xff800000
        +NaN   0x7fc00000 or 0x7ff00000
        in general: number = (sign ? -1:1) * 2^(exponent) * 1.(mantissa bits)
    */
 
    //! Union giving unsigned, signed and float views of the same 32 bits (used by IR()/FR() below).
    typedef union { u32 u; s32 s; f32 f; } inttofloat;
 
    // Reinterpret the storage of f through a pointer cast; f must be an lvalue since its address is taken.
    #define F32_AS_S32(f)       (*((s32 *) &(f)))
    #define F32_AS_U32(f)       (*((u32 *) &(f)))
    #define F32_AS_U32_POINTER(f)   ( ((u32 *) &(f)))
 
    // Bit patterns from the table above, plus masks for the sign bit and for exponent+mantissa.
    #define F32_VALUE_0     0x00000000
    #define F32_VALUE_1     0x3f800000
    #define F32_SIGN_BIT        0x80000000U
    #define F32_EXPON_MANTISSA  0x7FFFFFFFU
 
    //! code is taken from IceFPU
    //! Integer representation of a floating-point value.
    // With IRRLICHT_FAST_MATH this is a macro that reference-casts its argument to u32;
    // otherwise it is a function that copies the value through the inttofloat union.
#ifdef IRRLICHT_FAST_MATH
    #define IR(x)                           ((u32&)(x))
#else
    inline u32 IR(f32 x) {inttofloat tmp; tmp.f=x; return tmp.u;}
#endif
 
    //! Absolute integer representation of a floating-point value
    // (IR(x) with the top bit masked off)
    #define AIR(x)              (IR(x)&0x7fffffff)
 
    //! Floating-point representation of an integer value.
    // With IRRLICHT_FAST_MATH this is a macro that reference-casts its argument to f32;
    // otherwise two overloads copy the u32 / s32 bits through the inttofloat union.
#ifdef IRRLICHT_FAST_MATH
    #define FR(x)                           ((f32&)(x))
#else
    inline f32 FR(u32 x) {inttofloat tmp; tmp.u=x; return tmp.f;}
    inline f32 FR(s32 x) {inttofloat tmp; tmp.s=x; return tmp.f;}
#endif
 
    //! integer representation of 1.0
    #define IEEE_1_0            0x3f800000
    //! integer representation of 255.0
    #define IEEE_255_0          0x437f0000
 
// Comparison helpers for f32 values. With IRRLICHT_FAST_MATH they compare the
// value's bit pattern as an integer (via F32_AS_S32 / F32_AS_U32, so the
// argument must be an lvalue); otherwise they are ordinary float comparisons.
#ifdef IRRLICHT_FAST_MATH
    #define F32_LOWER_0(f)      (F32_AS_U32(f) >  F32_SIGN_BIT)
    #define F32_LOWER_EQUAL_0(f)    (F32_AS_S32(f) <= F32_VALUE_0)
    #define F32_GREATER_0(f)    (F32_AS_S32(f) >  F32_VALUE_0)
    #define F32_GREATER_EQUAL_0(f)  (F32_AS_U32(f) <= F32_SIGN_BIT)
    #define F32_EQUAL_1(f)      (F32_AS_U32(f) == F32_VALUE_1)
    #define F32_EQUAL_0(f)      ( (F32_AS_U32(f) & F32_EXPON_MANTISSA ) == F32_VALUE_0)
 
    // only same sign
    #define F32_A_GREATER_B(a,b)    (F32_AS_S32((a)) > F32_AS_S32((b)))
 
#else
 
    #define F32_LOWER_0(n)      ((n) <  0.0f)
    #define F32_LOWER_EQUAL_0(n)    ((n) <= 0.0f)
    #define F32_GREATER_0(n)    ((n) >  0.0f)
    #define F32_GREATER_EQUAL_0(n)  ((n) >= 0.0f)
    #define F32_EQUAL_1(n)      ((n) == 1.0f)
    #define F32_EQUAL_0(n)      ((n) == 0.0f)
    #define F32_A_GREATER_B(a,b)    ((a) > (b))
#endif
 
 
// REALINLINE: __forceinline when _MSC_VER is defined, plain inline otherwise.
// Left untouched if something has already defined it.
#ifndef REALINLINE
    #ifdef _MSC_VER
        #define REALINLINE __forceinline
    #else
        #define REALINLINE inline
    #endif
#endif
 
#if defined(__BORLANDC__) || defined (__BCPLUSPLUS__)
 
    // 8-bit bools in borland builder
 
    //! conditional set based on mask and arithmetic shift
    REALINLINE u32 if_c_a_else_b ( const c8 condition, const u32 a, const u32 b )
    {
        return ( ( -condition >> 7 ) & ( a ^ b ) ) ^ b;
    }
 
    //! conditional set based on mask and arithmetic shift
    REALINLINE u32 if_c_a_else_0 ( const c8 condition, const u32 a )
    {
        return ( -condition >> 31 ) & a;
    }
#else
 
    //! conditional set based on mask and arithmetic shift
    REALINLINE u32 if_c_a_else_b ( const s32 condition, const u32 a, const u32 b )
    {
        return ( ( -condition >> 31 ) & ( a ^ b ) ) ^ b;
    }
 
    //! conditional set based on mask and arithmetic shift
    REALINLINE u16 if_c_a_else_b ( const s16 condition, const u16 a, const u16 b )
    {
        return ( ( -condition >> 15 ) & ( a ^ b ) ) ^ b;
    }
 
    //! conditional set based on mask and arithmetic shift
    REALINLINE u32 if_c_a_else_0 ( const s32 condition, const u32 a )
    {
        return ( -condition >> 31 ) & a;
    }
#endif
 
    /*
        if (condition) state |= m; else state &= ~m;
    */
    REALINLINE void setbit_cond ( u32 &state, s32 condition, u32 mask )
    {
        // 0, or any postive to mask
        //s32 conmask = -condition >> 31;
        state ^= ( ( -condition >> 31 ) ^ state ) & mask;
    }
 
    inline f32 round_( f32 x )
    {
        return floorf( x + 0.5f );
    }
 
    //! Named for clearing floating-point exception state; as written it performs no work.
    /** NOTE(review): with IRRLICHT_FAST_MATH defined the body starts with an
    unconditional `return`, so the feclearexcept / fnclex / fclex statements
    after it are never reached; without IRRLICHT_FAST_MATH the body is empty.
    Presumably the clearing was disabled on purpose - confirm before relying on it. */
    REALINLINE void clearFPUException ()
    {
#ifdef IRRLICHT_FAST_MATH
        return;
#ifdef feclearexcept
        feclearexcept(FE_ALL_EXCEPT);
#elif defined(_MSC_VER)
        __asm fnclex;
#elif defined(__GNUC__) && defined(__x86__)
        __asm__ __volatile__ ("fclex \n\t");
#else
//#  warn clearFPUException not supported.
#endif
#endif
    }
 
    // calculate: sqrt ( x )
    REALINLINE f32 squareroot(const f32 f)
    {
        return sqrtf(f);
    }
 
    // calculate: sqrt ( x )
    REALINLINE f64 squareroot(const f64 f)
    {
        return sqrt(f);
    }
 
    // calculate: sqrt ( x )
    REALINLINE s32 squareroot(const s32 f)
    {
        return static_cast<s32>(squareroot(static_cast<f32>(f)));
    }
 
#ifdef __IRR_HAS_S64
    // calculate: sqrt ( x )
    REALINLINE s64 squareroot(const s64 f)
    {
        return static_cast<s64>(squareroot(static_cast<f64>(f)));
    }
#endif
 
    // calculate: 1 / sqrt ( x )
    REALINLINE f64 reciprocal_squareroot(const f64 x)
    {
#if defined ( IRRLICHT_FAST_MATH )
        double result = 1.0 / sqrt(x);
        //! pending perf test
        //_mm_store_sd(&result,_mm_div_sd(_mm_set_pd(0.0,1.0),_mm_sqrt_sd(_mm_load_sd(&x))));
        return result;
#else // no fast math
        return 1.0 / sqrt(x);
#endif
    }
 
    // calculate: 1 / sqrtf ( x )
    REALINLINE f32 reciprocal_squareroot(const f32 f)
    {
#if defined ( IRRLICHT_FAST_MATH )
        float result;
        _mm_store_ss(&result,_mm_rsqrt_ps(_mm_load_ss(&f)));
        return result;
#else // no fast math
        return 1.f / sqrtf(f);
#endif
    }
 
    // calculate: 1 / sqrtf( x )
    REALINLINE s32 reciprocal_squareroot(const s32 x)
    {
        return static_cast<s32>(reciprocal_squareroot(static_cast<f32>(x)));
    }
 
    // calculate: 1 / x
    REALINLINE f32 reciprocal( const f32 f )
    {
#if defined (IRRLICHT_FAST_MATH)
        float result;
        _mm_store_ss(&result,_mm_rcp_ps(_mm_load_ss(&f)));
        return result;
#else // no fast math
        return 1.f / f;
#endif
    }
 
    // calculate: 1 / x
    REALINLINE f64 reciprocal ( const f64 f )
    {
        return 1.0 / f;
    }
 
 
    // calculate: 1 / x, low precision allowed
    REALINLINE f32 reciprocal_approxim ( const f32 f )
    {
        //what was here before was not faster
        return reciprocal(f);
    }
 
 
    //! Converts x to an s32 floor value.
    /** Without IRRLICHT_FAST_MATH (or on a compiler that is neither MSVC nor
    GCC) this is (s32) floorf(x). The fast-math paths use x87 inline assembly
    that subtracts 0.5 from x and stores the result as an integer with fistp.
    NOTE(review): that store presumably rounds according to the current FPU
    rounding mode, so exact integers may not map to themselves - confirm
    before enabling IRRLICHT_FAST_MATH. */
    REALINLINE s32 floor32(f32 x)
    {
#ifdef IRRLICHT_FAST_MATH
        const f32 h = 0.5f;
 
        s32 t;
 
#if defined(_MSC_VER)
        __asm
        {
            fld x
            fsub    h
            fistp   t
        }
#elif defined(__GNUC__)
        __asm__ __volatile__ (
            "fsub %2 \n\t"
            "fistpl %0"
            : "=m" (t)
            : "t" (x), "f" (h)
            : "st"
            );
#else
//#  warn IRRLICHT_FAST_MATH not supported.
        return (s32) floorf ( x );
#endif
        return t;
#else // no fast math
        return (s32) floorf ( x );
#endif
    }
 
 
    //! Converts x to an s32 ceiling value.
    /** Without IRRLICHT_FAST_MATH (or on a compiler that is neither MSVC nor
    GCC) this is (s32) ceilf(x). The fast-math paths use x87 inline assembly
    that adds 0.5 to x and stores the result as an integer with fistp.
    NOTE(review): that store presumably rounds according to the current FPU
    rounding mode, so exact integers may not map to themselves - confirm
    before enabling IRRLICHT_FAST_MATH. */
    REALINLINE s32 ceil32 ( f32 x )
    {
#ifdef IRRLICHT_FAST_MATH
        const f32 h = 0.5f;
 
        s32 t;
 
#if defined(_MSC_VER)
        __asm
        {
            fld x
            fadd    h
            fistp   t
        }
#elif defined(__GNUC__)
        __asm__ __volatile__ (
            "fadd %2 \n\t"
            "fistpl %0 \n\t"
            : "=m"(t)
            : "t"(x), "f"(h)
            : "st"
            );
#else
//#  warn IRRLICHT_FAST_MATH not supported.
        return (s32) ceilf ( x );
#endif
        return t;
#else // not fast math
        return (s32) ceilf ( x );
#endif
    }
 
 
 
    //! Converts x to the nearest s32.
    /** Without IRRLICHT_FAST_MATH (or on a compiler that is neither MSVC nor
    GCC) this is (s32) round_(x), i.e. floorf(x + 0.5f). The fast-math paths
    load x onto the x87 stack and store it as an integer with fistp.
    NOTE(review): fistp presumably follows the current FPU rounding mode, so
    results for values exactly halfway between two integers may differ from
    round_() - confirm. */
    REALINLINE s32 round32(f32 x)
    {
#if defined(IRRLICHT_FAST_MATH)
        s32 t;
 
#if defined(_MSC_VER)
        __asm
        {
            fld   x
            fistp t
        }
#elif defined(__GNUC__)
        __asm__ __volatile__ (
            "fistpl %0 \n\t"
            : "=m"(t)
            : "t"(x)
            : "st"
            );
#else
//#  warn IRRLICHT_FAST_MATH not supported.
        return (s32) round_(x);
#endif
        return t;
#else // no fast math
        return (s32) round_(x);
#endif
    }
 
    inline f32 f32_max3(const f32 a, const f32 b, const f32 c)
    {
        return a > b ? (a > c ? a : c) : (b > c ? b : c);
    }
 
    inline f32 f32_min3(const f32 a, const f32 b, const f32 c)
    {
        return a < b ? (a < c ? a : c) : (b < c ? b : c);
    }
 
    inline f32 fract ( f32 x )
    {
        return x - floorf ( x );
    }
 
} // end namespace core
} // end namespace irr
 
#ifndef IRRLICHT_FAST_MATH
    using irr::core::IR;
    using irr::core::FR;
#endif
 
#endif
 
 
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO

Post by devsh »

okay, we managed to compile the whole thing in MSVC and in SSE2 as well, also figured out a way to get SSE3 in MSVC

It turns out that IRRLICHT_FAST_MATH didn't really work in the first place, so we don't use it for now (until we fix it in stock Irrlicht).

finally implemented the replacement new operators etc.
got the swizzle in (but only for 4d vectors, 8d would require 16million overloaded functions XD)
implemented more irrlicht functions of vector3df (implemented all from irrMath.h)

P.S. I didnt update the listing cause I hit the character limit, I'll get soren to host the files when we're done
robmar
Posts: 1125
Joined: Sun Aug 14, 2011 11:30 pm

Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO

Post by robmar »

So what do we do to add this speed-up?
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO

Post by devsh »

probably adding the new headers from the listings in the first post... however I started doing 32bit integer vectors and didnt get the chance to finish them... so I'll let everyone know when I'll have a new version thats compilable and works in 32 and 16bit integer mode
robmar
Posts: 1125
Joined: Sun Aug 14, 2011 11:30 pm

Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO

Post by robmar »

Okay, thanks, it would be good to give it a try. I'm getting frame stuttering when the Irrlicht camera flies along a path and goes from a less complex part of the scene to a more complex part, or that's how it looks.

I was wondering if that was because Irrlicht was starting to do the math for all those objects to be rendered, so am hoping that maybe this will improve it.
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO

Post by devsh »

THE vectorSIMD.h file containing our beautiful vector classes (part 1, doesn't fit in one post):

Code: Select all

// Copyright (C) 2014 Mateusz 'DevSH' Kielan
// This file is part of the "Irrlicht Engine".
// Contributed from "Build a World"
// For conditions of distribution and use, see copyright notice in irrlicht.h
 
#ifndef __IRR_VECTOR_SIMD_H_INCLUDED__
#define __IRR_VECTOR_SIMD_H_INCLUDED__
 
#include "IrrCompileConfig.h"
 
#ifdef __IRR_COMPILE_WITH_X86_SIMD_
 
#ifndef __IRR_COMPILE_WITH_SSE2
#error "Either give up on SIMD vectors, check your compiler settings for the -m*sse* flag, or upgrade your CPU"
#endif // __IRR_COMPILE_WITH_SSE2
 
#include "irrMath.h"
#include <stdint.h>
#include "SColor.h"
 
namespace irr
{
namespace core
{
    class vectorSIMDf;
    template <class T>
    class vectorSIMD_32;
    template <class T>
    class vectorSIMD_16;
 
    //a class for bitwise shizz
    template <int components> class vectorSIMDBool
    {
    public:
        inline vectorSIMDBool() {_mm_store_ps((float*)value,_mm_setzero_ps());}
        //! These constructors will bytewise cast the reg into the value
        inline vectorSIMDBool(const __m128 &reg) {_mm_store_ps((float*)value,reg);}
        inline vectorSIMDBool(const __m128d &reg) {_mm_store_pd((double*)value,reg);}
        inline vectorSIMDBool(const __m128i &reg) {_mm_store_si128((__m128i*)value,reg);}
        inline vectorSIMDBool(const vectorSIMDBool& other) {_mm_store_ps((float*)value,_mm_load_ps((float*)other.value));}
        //! reads 16 bytes from an array of uint8_t
        inline vectorSIMDBool(uint8_t* const &array) {_mm_store_ps((float*)value,_mm_loadu_ps((float*)array));}
        //! same as above, BUT WILL CRASH IF ARRAY NOT 16 BYTE ALIGNED
        inline vectorSIMDBool(uint8_t* const &array, bool ALIGNED) {_mm_store_ps((float*)value,_mm_load_ps((float*)array));}
        //! Constructor with the same value for all elements
        explicit vectorSIMDBool(const bool &n) {_mm_store_si128((__m128i*)value,n ? _mm_set_epi64x(-0x1ll,-0x1ll):_mm_setzero_si128());}
 
 
        inline vectorSIMDBool operator~() const { return _mm_xor_si128(getAsRegister(),_mm_set_epi64x(-0x1ll,-0x1ll)); }
        inline vectorSIMDBool operator&(const vectorSIMDBool &other) const { return _mm_and_si128(getAsRegister(),other.getAsRegister()); }
        inline vectorSIMDBool operator|(const vectorSIMDBool &other) const { return _mm_or_si128(getAsRegister(),other.getAsRegister()); }
        inline vectorSIMDBool operator^(const vectorSIMDBool &other) const { return _mm_xor_si128(getAsRegister(),other.getAsRegister()); }
 
/*
NO BITSHIFTING SUPPORT
*/
        inline vectorSIMDBool<components> operator!() const { return vectorSIMDBool<components>(); }
        inline vectorSIMDBool<components> operator&&(const vectorSIMDBool<components> &other) const { return vectorSIMDBool<components>(); }
        inline vectorSIMDBool<components> operator||(const vectorSIMDBool<components> &other) const { return vectorSIMDBool<components>(); }
 
 
        //! like GLSL, returns true if any bit of value is set
        inline bool any(void) const
        {
            return ((uint64_t*)value)[0]|((uint64_t*)value)[1];
        }
 
        //! like GLSL, returns true if all bits of value are set
        inline bool allBits(void) const
        {
            return (((uint64_t*)value)[0]&((uint64_t*)value)[1])==0xffffffffffffffffull;
        }
        //! like GLSL, returns true if all components non zero
        inline bool all(void) const
        {
            return 0;
        }
 
 
        //! in case you want to do your own SSE
        inline __m128i getAsRegister() const {_mm_load_si128((__m128i*)value);}
 
 
#ifdef _IRR_WINDOWS_
        __declspec(align(SIMD_ALIGNMENT)) uint8_t value[16];
    };
#else
    uint8_t value[16];
    } __attribute__ ((__aligned__(SIMD_ALIGNMENT)));
#endif
 
    //! partial specialization for variable width vectors
    template <>
    inline bool vectorSIMDBool<2>::all(void) const
    {
        return (((uint64_t*)value)[0]&&((uint64_t*)value)[1]);
    }
    template <>
    inline bool vectorSIMDBool<4>::all(void) const
    {
        return ((uint32_t*)value)[0]&&((uint32_t*)value)[1]&&((uint32_t*)value)[2]&&((uint32_t*)value)[3];
    }
    //! Explicit specialization for 8 lanes: true when all eight 16bit lanes are non-zero.
    /** Step by step: each 16bit lane is compared with zero and the result is
    inverted (XOR with all ones), giving all ones for a non-zero lane. Two
    shuffle+AND rounds then fold the eight lane masks into the two lowest 16bit
    lanes, whose low 32 bits are stored to tmpStorage and ANDed together. */
    template <>
    inline bool vectorSIMDBool<8>::all(void) const
    {
        __m128i xmm0 = _mm_xor_si128(_mm_cmpeq_epi16(getAsRegister(),_mm_setzero_si128()),_mm_set_epi16(-1,-1,-1,-1,-1,-1,-1,-1));
        xmm0 = _mm_and_si128(xmm0,_mm_shuffle_epi32(xmm0,_MM_SHUFFLE(0,1,2,3))); // (0&&6,1&&7,  2&&4,3&&5,  ...)
        xmm0 = _mm_and_si128(xmm0,_mm_shufflelo_epi16(xmm0,_MM_SHUFFLE(1,0,3,2))); // (0&&2&&4&&6, 1&&3&&5&&7, ... )
        uint16_t tmpStorage[2];
        // only the lowest 32 bits (two 16bit lanes) are written out
        _mm_store_ss((float*)tmpStorage,_mm_castsi128_ps(xmm0));
        return tmpStorage[0]&tmpStorage[1];
    }/*
    (disabled: 16-lane all() and the 2-lane logical operators are not implemented)
    template <>
    inline bool vectorSIMDBool<16>::all(void) const
    {
        __m128i xmm0 = _mm_xor_si128(_mm_cmpeq_epi8(getAsRegister(),_mm_setzero_si128()),_mm_set_epi8(-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1));
        xmm0 = _mm_and_si128(xmm0,_mm_shuffle_epi32(xmm0,_MM_SHUFFLE(0,1,2,3))); // (0&&12,1&&13,2&&14,3&&15,  4&&8,5&&9,6&&10,7&&11,  ...)
        xmm0 = _mm_and_si128(xmm0,_mm_shufflelo_epi16(xmm0,_MM_SHUFFLE(1,0,3,2))); // (0&&4&&8&&12,1&&5&&9&&13,2&&6&&10&&14,3&&7&&11&&15,  ...)
        xmm0 = _mm_and_si128(xmm0,_mm_slli_si128(xmm0,2)); // (even &&,odd &&,  ...)
        _mm_store_si128((__m128i*)tmpStorage,xmm0);
        return tmpStorage[0]&&tmpStorage[1];
    }
 
    //! following do ANDs (not bitwise ANDs)
    template <>
    inline vectorSIMDBool<2> vectorSIMDBool<2>::operator&&(const vectorSIMDBool<2> &other) const
    {
    }
    template <>
    inline vectorSIMDBool<2> vectorSIMDBool<2>::operator||(const vectorSIMDBool<2> &other) const
    {
    }
    template <>
    inline vectorSIMDBool<2> vectorSIMDBool<2>::operator!() const
    {
    }*/
    template <>
    inline vectorSIMDBool<4> vectorSIMDBool<4>::operator&&(const vectorSIMDBool<4> &other) const
    {
        __m128 xmm0 = _mm_and_ps(_mm_cmpneq_ps(_mm_castsi128_ps(other.getAsRegister()),_mm_setzero_ps()),_mm_cmpneq_ps(_mm_castsi128_ps(getAsRegister()),_mm_setzero_ps()));
        return vectorSIMDBool<4>(xmm0);
    }
    template <>
    inline vectorSIMDBool<4> vectorSIMDBool<4>::operator||(const vectorSIMDBool<4> &other) const
    {
        __m128i xmm0 = _mm_or_si128(other.getAsRegister(),getAsRegister());
        return vectorSIMDBool<4>(_mm_cmpneq_ps(_mm_castsi128_ps(xmm0),_mm_setzero_ps()));
    }
    template <>
    inline vectorSIMDBool<4> vectorSIMDBool<4>::operator!() const
    {
        return vectorSIMDBool<4>(_mm_cmpeq_ps(_mm_castsi128_ps(getAsRegister()),_mm_setzero_ps()));
    }
    template <>
    inline vectorSIMDBool<8> vectorSIMDBool<8>::operator&&(const vectorSIMDBool<8> &other) const
    {
        __m128i xmm0 = _mm_andnot_si128(_mm_cmpeq_epi16(other.getAsRegister(),_mm_setzero_si128()),_mm_xor_si128(_mm_cmpeq_epi16(getAsRegister(),_mm_setzero_si128()),_mm_set_epi16(-1,-1,-1,-1,-1,-1,-1,-1)));
        return vectorSIMDBool<8>(xmm0);
    }
    template <>
    inline vectorSIMDBool<8> vectorSIMDBool<8>::operator||(const vectorSIMDBool<8> &other) const
    {
        __m128i xmm0 = _mm_or_si128(other.getAsRegister(),getAsRegister());
        return vectorSIMDBool<8>(_mm_xor_si128(_mm_cmpeq_epi16(xmm0,_mm_setzero_si128()),_mm_set_epi16(-1,-1,-1,-1,-1,-1,-1,-1)));
    }
    template <>
    inline vectorSIMDBool<8> vectorSIMDBool<8>::operator!() const
    {
        return vectorSIMDBool<8>(_mm_cmpeq_epi16(getAsRegister(),_mm_setzero_si128()));
    }/*
    template <>
    inline vectorSIMDBool<16> vectorSIMDBool<16>::operator&&(const vectorSIMDBool<16> &other) const
    {
    }
    template <>
    inline vectorSIMDBool<16> vectorSIMDBool<16>::operator||(const vectorSIMDBool<16> &other) const
    {
    }
    template <>
    inline vectorSIMDBool<16> vectorSIMDBool<16>::operator!() const
    {
    }*/
 
 
    //! Typedefs for N-component boolean vectors
    // (the 16- and 2-component variants stay disabled: their specializations above are commented out)
    //typedef vectorSIMDBool<16> vector16db_SIMD;
    typedef vectorSIMDBool<8> vector8db_SIMD;
    typedef vectorSIMDBool<4> vector4db_SIMD;
    //typedef vectorSIMDBool<2> vector2db_SIMD;
  

Code: Select all

 
 
#include "SIMDswizzle.h"
 
 
 
#ifdef _IRR_WINDOWS_
    __declspec(align(SIMD_ALIGNMENT)) class vectorSIMDf : public SIMD_32bitSwizzleAble<vectorSIMDf,__m128>
#else
    class vectorSIMDf : public SIMD_32bitSwizzleAble<vectorSIMDf,__m128>
#endif
    {
    public:
        //! Default constructor (null vector).
        inline vectorSIMDf() {_mm_store_ps(pointer,_mm_setzero_ps());}
        //! Constructor with four different values, FASTEST IF the values are constant literals
        //yes this is correct usage with _mm_set_**(), due to little endianness the thing gets set in "reverse" order
        inline explicit vectorSIMDf(const float &nx, const float &ny, const float &nz, const float &nw) {_mm_store_ps(pointer,_mm_set_ps(nw,nz,ny,nx));}
        //! 3d constructor
        inline explicit vectorSIMDf(const float &nx, const float &ny, const float &nz) {_mm_store_ps(pointer,_mm_set_ps(0.f,nz,ny,nx));}
        //! 2d constructor
        inline explicit vectorSIMDf(const float &nx, const float &ny) {_mm_store_ps(pointer,_mm_set_ps(0.f,0.f,ny,nx));}
        //! Fast Constructor from floats, they come in normal order [0]=X,[1]=Y, etc.
        inline vectorSIMDf(float* const &array) {_mm_store_ps(pointer,_mm_loadu_ps(array));}
        //! Fastest Constructor from floats, they come in normal order [0]=X,[1]=Y, etc.
        //! Address has to be aligned to 16bytes OR WILL CRASH
        inline vectorSIMDf(float* const &array, bool ALIGNED) {_mm_store_ps(pointer,_mm_load_ps(array));}
        //! Fastest and most natural constructor
        inline vectorSIMDf(const __m128 &reg) {_mm_store_ps(pointer,reg);}
        //! Constructor with the same value for all elements
        inline explicit vectorSIMDf(const float &n) {_mm_store_ps(pointer,_mm_load_ps1(&n));}
        //! Copy constructor
        inline vectorSIMDf(const vectorSIMDf& other) {_mm_store_ps(pointer,other.getAsRegister());}
 
 
        static inline void* operator new(size_t size) throw(std::bad_alloc)
        {
            void *memoryallocatedaligned = 0;
#ifdef _IRR_WINDOWS_
            memoryallocatedaligned = _aligned_malloc(size,SIMD_ALIGNMENT);
#else
            posix_memalign((void**)&memoryallocatedaligned,SIMD_ALIGNMENT,size);
#endif
            return memoryallocatedaligned;
        }
        static inline void operator delete(void* ptr)
        {
#ifdef _IRR_WINDOWS_
            _aligned_free(ptr);
#else
            free(ptr);
#endif
        }
        static inline void* operator new[](size_t size) throw(std::bad_alloc)
        {
            void *memoryallocatedaligned = 0;
#ifdef _IRR_WINDOWS_
            memoryallocatedaligned = _aligned_malloc(size,SIMD_ALIGNMENT);
#else
            posix_memalign((void**)&memoryallocatedaligned,SIMD_ALIGNMENT,size);
#endif
            return memoryallocatedaligned;
        }
        static inline void  operator delete[](void* ptr) throw()
        {
#ifdef _IRR_WINDOWS_
            _aligned_free(ptr);
#else
            free(ptr);
#endif
        }
        //! Placement new: performs no allocation and simply hands back the caller-provided address p.
        //! The class-level operator new overloads hide the global placement form, hence the re-declaration.
        //! NOTE(review): the caller is presumably responsible for p being suitably aligned - confirm at call sites.
        static inline void* operator new(std::size_t size,void* p) throw(std::bad_alloc)
        {
            return p;
        }
        //! Matching placement delete: intentionally a no-op, nothing was allocated
        static inline void  operator delete(void* p,void* t) throw() {}
        //! Array placement new: returns p unchanged, no allocation
        static inline void* operator new[](std::size_t size,void* p) throw(std::bad_alloc)
        {
            return p;
        }
        //! Matching array placement delete: intentionally a no-op
        static inline void  operator delete[](void* p,void* t) throw() {}
/*
        inline vectorSIMDf(const vectorSIMDu32& other);
        inline vectorSIMDf(const vectorSIMDi32& other);
        inline vectorSIMDf(const vectorSIMDu16& other);
        inline vectorSIMDf(const vectorSIMDi16& other);
*/
 
        inline vectorSIMDf& operator=(const vectorSIMDf& other) { _mm_store_ps(pointer,other.getAsRegister()); return *this; }
 
        //! bitwise ops
        inline vectorSIMDf operator&(const vectorSIMDf& other) {return _mm_and_ps(getAsRegister(),other.getAsRegister());}
        inline vectorSIMDf operator|(const vectorSIMDf& other) {return _mm_or_ps(getAsRegister(),other.getAsRegister());}
        inline vectorSIMDf operator^(const vectorSIMDf& other) {return _mm_xor_ps(getAsRegister(),other.getAsRegister());}
 
        //! in case you want to do your own SSE
        inline __m128 getAsRegister() const {return _mm_load_ps(pointer);}
 
 
        // operators against vectors
        inline vectorSIMDf operator-() const { return _mm_xor_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000u)),getAsRegister()); }
 
        inline vectorSIMDf operator+(const vectorSIMDf& other) const { return _mm_add_ps(other.getAsRegister(),getAsRegister()); }
        inline vectorSIMDf& operator+=(const vectorSIMDf& other) { _mm_store_ps(pointer,_mm_add_ps(other.getAsRegister(),getAsRegister())); return *this; }
 
        inline vectorSIMDf operator-(const vectorSIMDf& other) const { return _mm_sub_ps(getAsRegister(),other.getAsRegister()); }
        inline vectorSIMDf& operator-=(const vectorSIMDf& other) { _mm_store_ps(pointer,_mm_sub_ps(getAsRegister(),other.getAsRegister())); return *this; }
 
        inline vectorSIMDf operator*(const vectorSIMDf& other) const { return _mm_mul_ps(getAsRegister(),other.getAsRegister()); }
        inline vectorSIMDf& operator*=(const vectorSIMDf& other) { _mm_store_ps(pointer,_mm_mul_ps(getAsRegister(),other.getAsRegister())); return *this; }
#ifdef IRRLICHT_FAST_MATH
        inline vectorSIMDf operator/(const vectorSIMDf& other) const { return _mm_mul_ps(getAsRegister(),_mm_rcp_ps(other.getAsRegister())); }
        inline vectorSIMDf& operator/=(const vectorSIMDf& other) { _mm_store_ps(pointer,_mm_mul_ps(getAsRegister(),_mm_rcp_ps(other.getAsRegister()))); return *this; }
#else
        inline vectorSIMDf operator/(const vectorSIMDf& other) const { return preciseDivision(other); }
        inline vectorSIMDf& operator/=(const vectorSIMDf& other) { (*this) = preciseDivision(other); return *this; }
#endif
        inline vectorSIMDf preciseDivision(const vectorSIMDf& other) const { return _mm_div_ps(getAsRegister(),other.getAsRegister()); }
 
 
        //operators against scalars
        inline vectorSIMDf  operator+(const float &val) const { return (*this)+vectorSIMDf(val); }
        inline vectorSIMDf& operator+=(const float &val) { return ( (*this) += vectorSIMDf(val) ); }
 
        inline vectorSIMDf operator-(const float &val) const { return (*this)-vectorSIMDf(val); }
        inline vectorSIMDf& operator-=(const float &val) { return ( (*this) -= vectorSIMDf(val) ); }
 
        inline vectorSIMDf  operator*(const float &val) const { return (*this)*vectorSIMDf(val); }
        inline vectorSIMDf& operator*=(const float &val) { return ( (*this) *= vectorSIMDf(val) ); }
 
#ifdef IRRLICHT_FAST_MATH
        inline vectorSIMDf operator/(const float &v) const { return vectorSIMDf(_mm_mul_ps(_mm_rcp_ps(_mm_load_ps1(&v)),getAsRegister())); }
        inline vectorSIMDf& operator/=(const float &v) { _mm_store_ps(pointer,_mm_mul_ps(_mm_rcp_ps(_mm_load_ps1(&v)),getAsRegister())); return *this; }
#else
        inline vectorSIMDf operator/(const float &v) const { return vectorSIMDf(_mm_div_ps(getAsRegister(),_mm_load_ps1(&v))); }
        inline vectorSIMDf& operator/=(const float &v) { _mm_store_ps(pointer,_mm_div_ps(getAsRegister(),_mm_load_ps1(&v))); return *this; }
#endif
 
        //! I AM BREAKING IRRLICHT'S COMPARISON OPERATORS
        inline vector4db_SIMD operator<=(const vectorSIMDf& other) const
        {
            return _mm_cmple_ps(getAsRegister(),other.getAsRegister());
        }
        inline vector4db_SIMD operator>=(const vectorSIMDf& other) const
        {
            return _mm_cmpge_ps(getAsRegister(),other.getAsRegister());
        }
        inline vector4db_SIMD operator<(const vectorSIMDf& other) const
        {
            return _mm_cmplt_ps(getAsRegister(),other.getAsRegister());
        }
        inline vector4db_SIMD operator>(const vectorSIMDf& other) const
        {
            return _mm_cmpgt_ps(getAsRegister(),other.getAsRegister());
        }
 
        //! only the method that returns bool confirms if two vectors are exactly the same
        inline vectorSIMDf operator==(const vectorSIMDf& other) const
        {
            return _mm_cmpeq_ps(getAsRegister(),other.getAsRegister());
        }
        inline vectorSIMDf operator!=(const vectorSIMDf& other) const
        {
            return _mm_cmpneq_ps(getAsRegister(),other.getAsRegister());
        }
 
 
 
        // functions
        //! zeroes out out of range components (useful before performing a dot product so it doesnt get polluted with random values)
        //! WARNING IT DOES COST CYCLES
        inline void makeSafe2D(void) {_mm_store_ps(pointer,_mm_and_ps(_mm_load_ps(pointer),_mm_castsi128_ps(_mm_set_epi32(0,0,-1,-1))));}
        inline void makeSafe3D(void) {_mm_store_ps(pointer,_mm_and_ps(_mm_load_ps(pointer),_mm_castsi128_ps(_mm_set_epi32(0,-1,-1,-1))));}
 
        //! slightly faster than memcpy'ing into the pointers
        inline vectorSIMDf& set(float* const &array) {_mm_store_ps(pointer,_mm_loadu_ps(array)); return *this;}
        //! FASTEST WAY TO SET VALUES, Address has to be aligned to 16bytes OR WILL CRASH
        inline vectorSIMDf& set(float* const &array, bool ALIGNED) {_mm_store_ps(pointer,_mm_load_ps(array));}
        //! normal set() like vector3df's, but for different dimensional vectors
        inline vectorSIMDf& set(const float &nx, const float &ny, const float &nz, const float &nw) {_mm_store_ps(pointer,_mm_set_ps(nw,nz,ny,nx)); return *this;}
        inline vectorSIMDf& set(const float &nx, const float &ny, const float &nz) {_mm_store_ps(pointer,_mm_set_ps(0.f,nz,ny,nx)); return *this;}
        inline vectorSIMDf& set(const float &nx, const float &ny) {_mm_store_ps(pointer,_mm_set_ps(0.f,0.f,ny,nx)); return *this;}
        inline vectorSIMDf& set(const vectorSIMDf& p) {_mm_store_ps(pointer,p.getAsRegister()); return *this;}
        //! convert from vectorNdf types of irrlicht - it will read a few values past the range of the allocated memory but _mm_loadu_ps shouldnt have that kind of protection
        inline vectorSIMDf& set(const vector3df &p) {_mm_store_ps(pointer,_mm_loadu_ps(&p.X)); makeSafe3D(); return *this;}
        inline vectorSIMDf& set(const vector2df &p) {_mm_store_ps(pointer,_mm_loadu_ps(&p.X)); makeSafe2D(); return *this;}
 
        //! going directly from vectorSIMD to irrlicht types is safe cause vectorSIMDf is wider
        inline vector2df& getAsVector2df(void) const
        {
            return *((vector2df*)pointer);
        }
        inline vector3df& getAsVector3df(void) const
        {
            return *((vector3df*)pointer);
        }
 
 
        //! Get length of the vector.
        inline float getLengthAsFloat() const
        {
            __m128 xmm0 = getAsRegister();
            float result;
#ifdef __IRR_COMPILE_WITH_SSE3
            xmm0 = _mm_mul_ps(xmm0,xmm0);
            xmm0 = _mm_hadd_ps(xmm0,xmm0);
            xmm0 = _mm_sqrt_ps(_mm_hadd_ps(xmm0,xmm0));
            _mm_store_ss(&result,xmm0);
            return result;
#elif defined(__IRR_COMPILE_WITH_SSE2)
            xmm0 = _mm_mul_ps(xmm0,xmm0);
            xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(0,1,2,3)));
            xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(2,3,0,1)));
            xmm0 = _mm_sqrt_ps(xmm0);
            _mm_store_ss(&result,xmm0);
            return result;
#endif
        }
        //! Useful when you have to divide a vector by another vector's length (so you dont convert/store to a scalar)
        //! all components are filled with length
        //! if you need something else, you can get the register and shuffle
        inline vectorSIMDf getLength() const
        {
            __m128 xmm0 = getAsRegister();
#ifdef __IRR_COMPILE_WITH_SSE3
            xmm0 = _mm_mul_ps(xmm0,xmm0);
            xmm0 = _mm_hadd_ps(xmm0,xmm0);
            return _mm_sqrt_ps(_mm_hadd_ps(xmm0,xmm0));
#elif defined(__IRR_COMPILE_WITH_SSE2)
            xmm0 = _mm_mul_ps(xmm0,xmm0);
            xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(0,1,2,3)));
            xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(2,3,0,1)));
            return _mm_sqrt_ps(xmm0);
#endif
        }
 
 
        inline vectorSIMDf getSquareRoot() const
        {
            return _mm_sqrt_ps(getAsRegister());
        }
 
        inline vectorSIMDf getReciprocalSQRT() const
        {
            return _mm_rsqrt_ps(getAsRegister());
        }
 
        //! Get the dot product with another vector.
        inline float dotProductAsFloat(const vectorSIMDf& other) const
        {
            float result;
            __m128 xmm0 = getAsRegister();
            __m128 xmm1 = other.getAsRegister();/*
#ifdef __IRR_COMPILE_WITH_SSE4_1
            xmm0 = _mm_dp_ps(xmm0,xmm1,);
#error "Implementation in >=SSE4.1 not ready yet"
#elif __IRR_COMPILE_WITH_SSE3*/
#ifdef __IRR_COMPILE_WITH_SSE3
            xmm0 = _mm_mul_ps(xmm0,xmm1);
            xmm0 = _mm_hadd_ps(xmm0,xmm0);
            xmm0 = _mm_hadd_ps(xmm0,xmm0);
            _mm_store_ss(&result,xmm0);
            return result;
#elif defined(__IRR_COMPILE_WITH_SSE2)
            xmm0 = _mm_mul_ps(xmm0,xmm1);
            xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(0,1,2,3)));
            xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(2,3,0,1)));
            _mm_store_ss(&result,xmm0);
            return result;
#endif
        }
 
        inline vectorSIMDf dotProduct(const vectorSIMDf& other) const
        {
            __m128 xmm0 = getAsRegister();
            __m128 xmm1 = other.getAsRegister();
#ifdef __IRR_COMPILE_WITH_SSE3
            xmm0 = _mm_mul_ps(xmm0,xmm1);
            xmm0 = _mm_hadd_ps(xmm0,xmm0);
            return _mm_hadd_ps(xmm0,xmm0);
#elif defined(__IRR_COMPILE_WITH_SSE2)
            xmm0 = _mm_mul_ps(xmm0,xmm1);
            xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(0,1,2,3)));
            return _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(2,3,0,1)));
#endif
        }
Last edited by devsh on Fri May 01, 2015 12:23 pm, edited 5 times in total.
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO

Post by devsh »

(part 2, file didn't fit in one post):

Code: Select all

 
        //! Get squared length of the vector.
        /** This is useful because it is much faster than getLength().
        \return Squared length of the vector. **/
        inline float getLengthSQAsFloat() const
        {
            float result;
            _mm_store_ss(&result,dotProduct(*this).getAsRegister());
            return result;
        }
        //! Useful when you have to divide a vector by another vector's length (so you dont convert/store to a scalar)
        inline vectorSIMDf getLengthSQ() const
        {
            return dotProduct(*this);
        }
 
 
        //! Get distance from another point.
        /** Here, the vector is interpreted as point in 3 dimensional space. **/
        inline float getDistanceFromAsFloat(const vectorSIMDf& other) const
        {
            float result;
            _mm_store_ss(&result,((*this)-other).getLength().getAsRegister());
            return result;
        }
        inline vectorSIMDf getDistanceFrom(const vectorSIMDf& other) const
        {
            return ((*this)-other).getLength();
        }
 
        //! Returns squared distance from another point.
        /** Here, the vector is interpreted as point in 3 dimensional space. **/
        inline float getDistanceFromSQAsFloat(const vectorSIMDf& other) const
        {
            float result;
            _mm_store_ss(&result,((*this)-other).getLengthSQ().getAsRegister());
            return result;
        }
        inline vectorSIMDf getDistanceFromSQ(const vectorSIMDf& other) const
        {
            return ((*this)-other).getLengthSQ();
        }
 
        //! Calculates the cross product with another vector.
        /** \param p Vector to multiply with.
        \return Crossproduct of this vector with p. **/
        inline vectorSIMDf crossProduct(const vectorSIMDf& p) const
        {
            __m128 xmm0 = getAsRegister();
            __m128 xmm1 = p.getAsRegister();
#ifdef __IRR_COMPILE_WITH_SSE2 //! SSE2 implementation is faster than previous SSE3 implementation
            __m128 backslash = _mm_mul_ps(FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,0,2,1)),FAST_FLOAT_SHUFFLE(xmm1,_MM_SHUFFLE(3,1,0,2)));
            __m128 forwardslash = _mm_mul_ps(FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,1,0,2)),FAST_FLOAT_SHUFFLE(xmm1,_MM_SHUFFLE(3,0,2,1)));
            return _mm_sub_ps(backslash,forwardslash); //returns 0 in the last component :D
#endif
        }
 
        //! Normalizes the vector.
        /** In case of the 0 vector the result is still 0, otherwise
        the length of the vector will be 1.
        \return Reference to this vector after normalization. **/
        inline vectorSIMDf normalize() const
        {
            __m128 xmm0 = getAsRegister();
            __m128 xmm1 = getLengthSQ().getAsRegister();// the uncecessary load/store and variable construction will get optimized out with inline
#ifdef IRRLICHT_FAST_MATH
            return _mm_mul_ps(xmm0,_mm_rsqrt_ps(xmm1));
#else
            return _mm_div_ps(xmm0,_mm_sqrt_ps(xmm1));
#endif
        }
 
        //! Sets the length of the vector to a new value
        inline vectorSIMDf& setLengthAsFloat(float newlength)
        {
            (*this) = normalize()*newlength;
            return (*this);
        }
 
        //! Inverts the vector.
        inline vectorSIMDf& invert()
        {
            _mm_store_ps(pointer,_mm_xor_ps(_mm_castsi128_ps(_mm_set_epi32(0x80000000u,0x80000000u,0x80000000u,0x80000000u)),getAsRegister()));
            return *this;
        }
        //! Returns component-wise absolute value of a
        inline vectorSIMDf abs(const vectorSIMDf& a) const
        {
            return _mm_and_ps(a.getAsRegister(),_mm_castsi128_ps(_mm_set_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)));
        }
        //! Returns component-wise absolute value of itself
        inline vectorSIMDf getAbsoluteValue() const
        {
            return abs(*this);
        } 

Code: Select all

 
 
 
        //! Rotates the vector by a specified number of RADIANS around the Y axis and the specified center.
        /** \param radians Number of RADIANS to rotate around the Y axis.
        \param center The center of the rotation. */
        inline void rotateXZByRAD(const float &radians, const vectorSIMDf& center)
        {
            __m128 xmm1 = center.getAsRegister();
            __m128 xmm0 = _mm_sub_ps(getAsRegister(),xmm1);
 
            float cs = cosf(radians);
            float sn = sinf(radians);
            __m128 xmm2 = _mm_mul_ps(_mm_load_ps1(&cs),xmm0); // now contains (X*cos,radom_crap,Z*cos,random_crap)
            __m128 xmm3 = _mm_mul_ps(_mm_load_ps1(&sn),FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,0,1,2))); // now contains (Z*sin,radom_crap,X*cos,random_crap)
            xmm3 = _mm_xor_ps(xmm3,_mm_castsi128_ps(_mm_set_epi32(0,0,0,0x80000000u))); // invert the Z*sin
            xmm0 = _mm_add_ps(_mm_add_ps(xmm2,xmm3),xmm1); // gives us ((X*cs - Z*sn), (X*cs - Z*sn), (X*sn + Z*cs), (X*sn + Z*cs))
 
            _mm_maskmoveu_si128(_mm_castps_si128(xmm0),_mm_set_epi32(0,-1,0,-1),(char*)pointer);// only overwrites the X,Z elements of our vector
        }
        inline void rotateXZByRAD(const float &radians)
        {
            __m128 xmm0 = getAsRegister();
 
            float cs = cosf(radians);
            float sn = sinf(radians);
            __m128 xmm2 = _mm_mul_ps(_mm_load_ps1(&cs),xmm0); // now contains (X*cos,radom_crap,Z*cos,random_crap)
            __m128 xmm3 = _mm_mul_ps(_mm_load_ps1(&sn),FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,0,1,2))); // now contains (Z*sin,radom_crap,X*cos,random_crap)
            xmm3 = _mm_xor_ps(xmm3,_mm_castsi128_ps(_mm_set_epi32(0,0,0,0x80000000u))); // invert the Z*sin
            xmm0 = _mm_add_ps(xmm2,xmm3); // gives us ((X*cs - Z*sn), (X*cs - Z*sn), (X*sn + Z*cs), (X*sn + Z*cs))
 
            _mm_maskmoveu_si128(_mm_castps_si128(xmm0),_mm_set_epi32(0,-1,0,-1),(char*)pointer);// only overwrites the X,Z elements of our vector
        }
 
        //! Rotates the vector by a specified number of RADIANS around the Z axis and the specified center.
        /** \param RADIANS: Number of RADIANS to rotate around the Z axis.
        \param center: The center of the rotation. */
        inline void rotateXYByRAD(const float &radians, const vectorSIMDf& center)
        {
            __m128 xmm1 = center.getAsRegister();
            __m128 xmm0 = _mm_sub_ps(getAsRegister(),xmm1);
 
            float cs = cosf(radians);
            float sn = sinf(radians);
            __m128 xmm2 = _mm_mul_ps(_mm_load_ps1(&cs),xmm0); // now contains (X*cos,Y*cos,...,...)
            __m128 xmm3 = _mm_mul_ps(_mm_load_ps1(&sn),FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,2,0,1))); // now contains (Y*sin,X*cos,...)
            xmm3 = _mm_xor_ps(xmm3,_mm_castsi128_ps(_mm_set_epi32(0,0,0,0x80000000u))); // invert the Y*sin
            xmm0 = _mm_add_ps(_mm_add_ps(xmm2,xmm3),xmm1); // gives us ((X*cs - Y*sn), (Y*cs + X*sn),...,...)
 
            _mm_maskmoveu_si128(_mm_castps_si128(xmm0),_mm_set_epi32(0,0,-1,-1),(char*)pointer);// only overwrites the X,Y elements of our vector
        }
        inline void rotateXYByRAD(const float &radians)
        {
            __m128 xmm0 = getAsRegister();
 
            float cs = cosf(radians);
            float sn = sinf(radians);
            __m128 xmm2 = _mm_mul_ps(_mm_load_ps1(&cs),xmm0); // now contains (X*cos,Y*cos,...,...)
            __m128 xmm3 = _mm_mul_ps(_mm_load_ps1(&sn),FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,2,0,1))); // now contains (Y*sin,X*sin,...)
            xmm3 = _mm_xor_ps(xmm3,_mm_castsi128_ps(_mm_set_epi32(0,0,0,0x80000000u))); // invert the Y*sin
            xmm0 = _mm_add_ps(xmm2,xmm3); // gives us ((X*cs - Y*sn), (Y*cs + X*sn),...,...)
 
            _mm_maskmoveu_si128(_mm_castps_si128(xmm0),_mm_set_epi32(0,0,-1,-1),(char*)pointer);// only overwrites the X,Y elements of our vector
        }
 
        //! Rotates the vector by a specified number of degrees around the X axis and the specified center.
        /** \param degrees: Number of degrees to rotate around the X axis.
        \param center: The center of the rotation. */
        inline void rotateYZByRAD(const float &radians, const vectorSIMDf& center)
        {
            __m128 xmm1 = center.getAsRegister();
            __m128 xmm0 = _mm_sub_ps(getAsRegister(),xmm1);
 
            float cs = cosf(radians);
            float sn = sinf(radians);
            __m128 xmm2 = _mm_mul_ps(_mm_load_ps1(&cs),xmm0); // now contains (X*cos,Y*cos,...,...)
            __m128 xmm3 = _mm_mul_ps(_mm_load_ps1(&sn),FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,1,2,0))); // now contains (...,Z*sin,Y*sin,...)
            xmm3 = _mm_xor_ps(xmm3,_mm_castsi128_ps(_mm_set_epi32(0,0,0x80000000u,0))); // invert the Z*sin
            xmm0 = _mm_add_ps(_mm_add_ps(xmm2,xmm3),xmm1); // gives us ((X*cs - Y*sn), (Y*cs + X*sn),...,...)
 
            _mm_maskmoveu_si128(_mm_castps_si128(xmm0),_mm_set_epi32(0,-1,-1,0),(char*)pointer);// only overwrites the X,Y elements of our vector
        }
        inline void rotateYZByRAD(const float &radians)
        {
            __m128 xmm0 = getAsRegister();
 
            float cs = cosf(radians);
            float sn = sinf(radians);
            __m128 xmm2 = _mm_mul_ps(_mm_load_ps1(&cs),xmm0); // now contains (X*cos,Y*cos,...,...)
            __m128 xmm3 = _mm_mul_ps(_mm_load_ps1(&sn),FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,1,2,0))); // now contains (...,Z*sin,Y*sin,...)
            xmm3 = _mm_xor_ps(xmm3,_mm_castsi128_ps(_mm_set_epi32(0,0,0x80000000u,0))); // invert the Z*sin
            xmm0 = _mm_add_ps(xmm2,xmm3); // gives us ((X*cs - Y*sn), (Y*cs + X*sn),...,...)
 
            _mm_maskmoveu_si128(_mm_castps_si128(xmm0),_mm_set_epi32(0,-1,-1,0),(char*)pointer);// only overwrites the X,Y elements of our vector
        }
 
 
 
        //! Get the rotations that would make a (0,0,1) direction vector point in the same direction as this direction vector.
        /* Thanks to Arras on the Irrlicht forums for this method.  This utility method is very useful for
        orienting scene nodes towards specific targets.  For example, if this vector represents the difference
        between two scene nodes, then applying the result of getHorizontalAngle() to one scene node will point
        it at the other one.
        Example code:
        // Where target and seeker are of type ISceneNode*
        const vector3df toTarget(target->getAbsolutePosition() - seeker->getAbsolutePosition());
        const vector3df requiredRotation = toTarget.getHorizontalAngle();
        seeker->setRotation(requiredRotation);
 
        \return A rotation vector containing the X (pitch) and Y (raw) rotations (in degrees) that when applied to a
        +Z (e.g. 0, 0, 1) direction vector would make it point in the same direction as this vector. The Z (roll) rotation
        is always 0, since two Euler rotations are sufficient to point in any given direction. *
        inline vectorSIMDf getHorizontalAngle3D() const
        {
            vectorSIMDf angle;
 
            const float tmp = atan2f(x,z);
            angle.y = tmp;
 
            __m128 xmm0 = ((*this)*(*this)).getAsRegister();
            xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,0,1,2)));
            float z1;
            _mm_store_ss(&z1,_mm_sqrt_ss(xmm0));
 
            angle.x = atan2f(z1, y) - core::PI*0.5f;
 
            return angle;
        }
 
        //! Get the spherical coordinate angles, can we do 4-sphere coordinates
        /** This returns Euler radians for the point represented by
        this vector.
        *
        inline vectorSIMDf getSphericalCoordinates3D() const
        {
            vectorSIMDf angle = *this;
            angle.makeSafe3D();
            angle = angle.getLength();
 
            if (angle.w) //doesnt matter which component
            {
                if (X!=0)
                {
                    angle.Y = atan2f(Z,X);
                }
                else if (Z<0)
                    angle.Y=180;
 
                angle.X = (T)(acos(Y * core::reciprocal_squareroot(length)) * RADTODEG64);
            }
            else
                return vectorSIMDf(0.f);
        }
 
        //! Builds a direction vector from (this) rotation vector.
        /** This vector is assumed to be a rotation vector composed of 3 Euler angle rotations, in degrees.
        The implementation performs the same calculations as using a matrix to do the rotation.
 
        \param[in] forwards  The direction representing "forwards" which will be rotated by this vector.
        If you do not provide a direction, then the +Z axis (0, 0, 1) will be assumed to be forwards.
        \return A direction vector calculated by rotating the forwards direction by the 3 Euler angles
        (in degrees) represented by this vector. *
        inline vectorSIMDf rotationToDirection3D() const
        {
            const float cr = cosf( x );
            const float sr = sinf( x );
            const float cp = cosf( y );
            const float sp = sinf( y );
            const float cy = cosf( z );
            const float sy = sinf( z );
 
            const float crsp = cr*sp;
 
            return vectorSIMDf(( crsp*cy+sr*sy ), ( crsp*sy-sr*cy ), ( cr*cp ),0);
        }
        inline vectorSIMDf rotationToDirection3D(const vectorSIMDf &forwards = vectorSIMDf(0, 0, 1, 0)) const
        {
            const float cr = cosf( x );
            const float sr = sinf( x );
            const float cp = cosf( y );
            const float sp = sinf( y );
            const float cy = cosf( z );
            const float sy = sinf( z );
 
            const float crsp = cr*sp;
            const float srsp = sr*sp;
 
            const f64 pseudoMatrix[] = {
                ( cp*cy ), ( cp*sy ), ( -sp ),
                ( srsp*cy-cr*sy ), ( srsp*sy+cr*cy ), ( sr*cp ),
                ( crsp*cy+sr*sy ), ( crsp*sy-sr*cy ), ( cr*cp )};
 
            return vector3d<T>(
                (T)(forwards.X * pseudoMatrix[0] +
                    forwards.Y * pseudoMatrix[3] +
                    forwards.Z * pseudoMatrix[6]),
                (T)(forwards.X * pseudoMatrix[1] +
                    forwards.Y * pseudoMatrix[4] +
                    forwards.Z * pseudoMatrix[7]),
                (T)(forwards.X * pseudoMatrix[2] +
                    forwards.Y * pseudoMatrix[5] +
                    forwards.Z * pseudoMatrix[8]));
        }*/
 
        static inline vectorSIMDf fromSColor(const irr::video::SColor &col)
        {
            vectorSIMDf retVal;
 
            __m128i xmm0 = _mm_castps_si128(_mm_load_ss((float*)&col.color));
            xmm0 = _mm_unpacklo_epi8(xmm0,_mm_setzero_si128());
            xmm0 = _mm_unpacklo_epi16(xmm0,_mm_setzero_si128());
            __m128 xmm1 = _mm_div_ps(_mm_cvtepi32_ps(xmm0),_mm_set_ps(255.f,255.f,255.f,255.f));
            xmm1 = FAST_FLOAT_SHUFFLE(xmm1,_MM_SHUFFLE(3,0,1,2));
            _mm_store_ps(retVal.pointer,xmm1);
 
            return retVal;
        }
 
 
        //! Storage: four floats that can be addressed through several naming schemes.
        //! All members of the union alias the same 16 bytes.
        union
        {
            //! upper-case component names
            struct{
                float X; float Y; float Z; float W;
            };
            //! lower-case component names
            struct{
                float x; float y; float z; float w;
            };
            //! colour-style names
            struct{
                float r; float g; float b; float a;
            };
            //! s/t/p/q names
            struct{
                float s; float t; float p; float q;
            };
            //! raw array view; the aligned SSE loads and stores of this class go through it
            float pointer[4];
        };
#ifdef _IRR_WINDOWS_
    };
#else
    } __attribute__ ((__aligned__(SIMD_ALIGNMENT)));
#endif
 
 
    static inline vectorSIMDf radToDeg(const vectorSIMDf& radians)
    {
        return radians*vectorSIMDf(RADTODEG);
    }
    static inline vectorSIMDf degToRad(const vectorSIMDf& degrees)
    {
        return degrees*vectorSIMDf(DEGTORAD);
    }
    static inline vectorSIMDf mix(const vectorSIMDf& a, const vectorSIMDf& b, const vectorSIMDf& t)
    {
        return a+(b-a)*t;
    }
    static inline vectorSIMDf lerp(const vectorSIMDf& a, const vectorSIMDf& b, const vectorSIMDf& t)
    {
        return mix(a,b,t);
    }
    template<>
    inline vectorSIMDf max_(const vectorSIMDf& a, const vectorSIMDf& b)
    {
        return _mm_max_ps(a.getAsRegister(),b.getAsRegister());
    }
    template<>
    inline vectorSIMDf min_(const vectorSIMDf& a, const vectorSIMDf& b)
    {
        return _mm_min_ps(a.getAsRegister(),b.getAsRegister());
    }
    inline vectorSIMDf clamp(const vectorSIMDf& value, const vectorSIMDf& low, const vectorSIMDf& high)
    {
        return min_(max_(value,low),high);
    }
    inline vectorSIMDf floor(const vectorSIMDf& a)
    {
        vectorSIMDf b = a;
        vector4db_SIMD notTooLargeToFloor = b.getAbsoluteValue()<vectorSIMDf(float(0x800000)); //cutoff point for flooring
        __m128i xmm0 = _mm_cvtps_epi32(b.getAsRegister());
        _mm_maskmoveu_si128(_mm_castps_si128(_mm_cvtepi32_ps(xmm0)),notTooLargeToFloor.getAsRegister(),(char*)b.pointer);
        return b;
    }
    inline vectorSIMDf fract(const vectorSIMDf& a)
    {
        return a-floor(a);
    }
    inline vectorSIMDf sqrt(const vectorSIMDf& a)
    {
        return _mm_sqrt_ps(a.getAsRegister());
    }
    inline vectorSIMDf inversesqrt(const vectorSIMDf& a)
    {
        return _mm_rsqrt_ps(a.getAsRegister());
    }
    inline vectorSIMDf reciprocal(const vectorSIMDf& a)
    {
        return _mm_rcp_ps(a.getAsRegister());
    }
 
    //! Typedefs for f32 n-dimensional vectors.
    //! All three names alias the same four-float vectorSIMDf; the 2d/3d names only document intent,
    //! the type itself does not track how many components are in use.
    typedef vectorSIMDf vector4df_SIMD;
    typedef vectorSIMDf vector3df_SIMD;
    typedef vectorSIMDf vector2df_SIMD;
 
    template <class T>
    class vectorSIMD_32 : public SIMD_32bitSwizzleAble<vectorSIMD_32<T>,__m128i>
    {
    public:
        //! Default constructor (null vector).
        inline vectorSIMD_32() {_mm_store_si128((__m128i*)pointer,_mm_setzero_si128());}
        inline vectorSIMD_32(T* const &array) {_mm_store_si128((__m128i*)pointer,_mm_loadu_si128((__m128i*)array));}
        inline vectorSIMD_32(T* const &array, bool ALIGNED) {_mm_store_si128((__m128i*)pointer,_mm_load_si128((__m128i*)array));}
        //! Fastest and most natural constructor
        inline vectorSIMD_32(const __m128i &reg) {_mm_store_si128((__m128i*)pointer,reg);}
        //! Constructor with the same value for all elements
        inline explicit vectorSIMD_32(const T &n) {_mm_store_si128((__m128i*)pointer,_mm_castps_si128(_mm_load_ps1((float*)&n)));}
        //! Copy constructor
        inline vectorSIMD_32(const vectorSIMD_32<T>& other) {_mm_store_si128((__m128i*)pointer,other.getAsRegister());}
 
 
        static inline void* operator new(size_t size) throw(std::bad_alloc)
        {
            void *memoryallocatedaligned = 0;
#ifdef _IRR_WINDOWS_
            memoryallocatedaligned = _aligned_malloc(size,SIMD_ALIGNMENT);
#else
            posix_memalign((void**)&memoryallocatedaligned,SIMD_ALIGNMENT,size);
#endif
            return memoryallocatedaligned;
        }
        static inline void operator delete(void* ptr)
        {
#ifdef _IRR_WINDOWS_
            _aligned_free(ptr);
#else
            free(ptr);
#endif
        }
        static inline void* operator new[](size_t size) throw(std::bad_alloc)
        {
            void *memoryallocatedaligned = 0;
#ifdef _IRR_WINDOWS_
            memoryallocatedaligned = _aligned_malloc(size,SIMD_ALIGNMENT);
#else
            posix_memalign((void**)&memoryallocatedaligned,SIMD_ALIGNMENT,size);
#endif
            return memoryallocatedaligned;
        }
        static inline void  operator delete[](void* ptr) throw()
        {
#ifdef _IRR_WINDOWS_
            _aligned_free(ptr);
#else
            free(ptr);
#endif
        }
        static inline void* operator new(std::size_t size,void* p) throw(std::bad_alloc)
        {
            return p;
        }
        static inline void  operator delete(void* p,void* t) throw() {}
        static inline void* operator new[](std::size_t size,void* p) throw(std::bad_alloc)
        {
            return p;
        }
        static inline void  operator delete[](void* p,void* t) throw() {}
/*
        inline vectorSIMDf(const vectorSIMDu32& other);
        inline vectorSIMDf(const vectorSIMDi32& other);
        inline vectorSIMDf(const vectorSIMDu16& other);
        inline vectorSIMDf(const vectorSIMDi16& other);
**/
 
        inline vectorSIMD_32<T>& operator=(const vectorSIMD_32<T>& other) { _mm_store_si128((__m128i*)pointer,other.getAsRegister()); return *this; }
 
        //! bitwise ops
        inline vectorSIMD_32<T> operator&(const vectorSIMD_32<T>& other) {return _mm_and_si128(getAsRegister(),other.getAsRegister());}
        inline vectorSIMD_32<T> operator|(const vectorSIMD_32<T>& other) {return _mm_or_si128(getAsRegister(),other.getAsRegister());}
        inline vectorSIMD_32<T> operator^(const vectorSIMD_32<T>& other) {return _mm_xor_si128(getAsRegister(),other.getAsRegister());}
 
        //! in case you want to do your own SSE
        inline __m128i getAsRegister() const {return _mm_load_si128((__m128i*)pointer);}
 
/*
        // operators against vectors
        inline vectorSIMD_32<T> operator-() const { return _mm_xor_ps(_mm_castsi128_ps(_mm_set1_epi32(0x80000000u)),getAsRegister()); }
 
        inline vectorSIMD_32<T> operator+(const vectorSIMD_32<T>& other) const { return _mm_add_ps(other.getAsRegister(),getAsRegister()); }
        inline vectorSIMD_32<T>& operator+=(const vectorSIMD_32<T>& other) { _mm_store_ps(pointer,_mm_add_ps(other.getAsRegister(),getAsRegister())); return *this; }
 
        inline vectorSIMD_32<T> operator-(const vectorSIMD_32<T>& other) const { return _mm_sub_ps(getAsRegister(),other.getAsRegister()); }
        inline vectorSIMD_32<T>& operator-=(const vectorSIMD_32<T>& other) { _mm_store_ps(pointer,_mm_sub_ps(getAsRegister(),other.getAsRegister())); return *this; }
 
        inline vectorSIMDf operator*(const vectorSIMDf& other) const { return _mm_mul_ps(getAsRegister(),other.getAsRegister()); }
        inline vectorSIMD_32<T> operator*(const vectorSIMD_32<T>& other) const { return _mm_mul_ps(getAsRegister(),other.getAsRegister()); }
        inline vectorSIMD_32<T>& operator*=(const vectorSIMD_32<T>& other) { _mm_store_ps(pointer,_mm_mul_ps(getAsRegister(),other.getAsRegister())); return *this; }
 
        inline vectorSIMDf operator/(const vectorSIMDf& other) const { return preciseDivision(other); }
        inline vectorSIMD_32<T> operator/(const vectorSIMD_32<T>& other) const { return preciseDivision(other); }
        inline vectorSIMD_32<T>& operator/=(const vectorSIMD_32<T>& other) { (*this) = preciseDivision(other); return *this; }
 
/*
        //operators against scalars
        inline vectorSIMDf  operator+(const float &val) const { return (*this)+vectorSIMDf(val); }
        inline vectorSIMDf& operator+=(const float &val) { return ( (*this) += vectorSIMDf(val) ); }
 
        inline vectorSIMDf operator-(const float &val) const { return (*this)-vectorSIMDf(val); }
        inline vectorSIMDf& operator-=(const float &val) { return ( (*this) -= vectorSIMDf(val) ); }
 
        inline vectorSIMDf  operator*(const float &val) const { return (*this)*vectorSIMDf(val); }
        inline vectorSIMDf& operator*=(const float &val) { return ( (*this) *= vectorSIMDf(val) ); }
 
#ifdef IRRLICHT_FAST_MATH
        inline vectorSIMDf operator/(const float &v) const { return vectorSIMDf(_mm_mul_ps(_mm_rcp_ps(_mm_load_ps1(&v)),getAsRegister())); }
        inline vectorSIMDf& operator/=(const float &v) { _mm_store_ps(pointer,_mm_mul_ps(_mm_rcp_ps(_mm_load_ps1(&v)),getAsRegister())); return *this; }
#else
        inline vectorSIMDf operator/(const float &v) const { return vectorSIMDf(_mm_div_ps(getAsRegister(),_mm_load_ps1(&v))); }
        inline vectorSIMDf& operator/=(const float &v) { _mm_store_ps(pointer,_mm_div_ps(getAsRegister(),_mm_load_ps1(&v))); return *this; }
#endif
 
        //! I AM BREAKING IRRLICHT'S COMPARISON OPERATORS
        inline vector4db_SIMD operator<=(const vectorSIMDf& other) const
        {
            return _mm_cmple_ps(getAsRegister(),other.getAsRegister());
        }
        inline vector4db_SIMD operator>=(const vectorSIMDf& other) const
        {
            return _mm_cmpge_ps(getAsRegister(),other.getAsRegister());
        }
        inline vector4db_SIMD operator<(const vectorSIMDf& other) const
        {
            return _mm_cmplt_ps(getAsRegister(),other.getAsRegister());
        }
        inline vector4db_SIMD operator>(const vectorSIMDf& other) const
        {
            return _mm_cmpgt_ps(getAsRegister(),other.getAsRegister());
        }
 
        //! only the method that returns bool confirms if two vectors are exactly the same
        inline vectorSIMDf operator==(const vectorSIMDf& other) const
        {
            return _mm_cmpeq_ps(getAsRegister(),other.getAsRegister());
        }
        inline vectorSIMDf operator!=(const vectorSIMDf& other) const
        {
            return _mm_cmpneq_ps(getAsRegister(),other.getAsRegister());
        }
 
 
 
        // functions
        //! zeroes out out of range components (useful before performing a dot product so it doesnt get polluted with random values)
        //! WARNING IT DOES COST CYCLES
        inline void makeSafe2D(void) {_mm_store_ps(pointer,_mm_and_ps(_mm_load_ps(pointer),_mm_castsi128_ps(_mm_set_epi32(0,0,-1,-1))));}
        inline void makeSafe3D(void) {_mm_store_ps(pointer,_mm_and_ps(_mm_load_ps(pointer),_mm_castsi128_ps(_mm_set_epi32(0,-1,-1,-1))));}
 
        //! slightly faster than memcpy'ing into the pointers
        inline vectorSIMDf& set(float* const &array) {_mm_store_ps(pointer,_mm_loadu_ps(array)); return *this;}
        //! FASTEST WAY TO SET VALUES, Address has to be aligned to 16bytes OR WILL CRASH
        inline vectorSIMDf& set(float* const &array, bool ALIGNED) {_mm_store_ps(pointer,_mm_load_ps(array));}
        //! normal set() like vector3df's, but for different dimensional vectors
        inline vectorSIMDf& set(const float &nx, const float &ny, const float &nz, const float &nw) {_mm_store_ps(pointer,_mm_set_ps(nw,nz,ny,nx)); return *this;}
        inline vectorSIMDf& set(const float &nx, const float &ny, const float &nz) {_mm_store_ps(pointer,_mm_set_ps(0.f,nz,ny,nx)); return *this;}
        inline vectorSIMDf& set(const float &nx, const float &ny) {_mm_store_ps(pointer,_mm_set_ps(0.f,0.f,ny,nx)); return *this;}
        inline vectorSIMDf& set(const vectorSIMDf& p) {_mm_store_ps(pointer,p.getAsRegister()); return *this;}
        //! convert from vectorNdf types of irrlicht - it will read a few values past the range of the allocated memory but _mm_loadu_ps shouldnt have that kind of protection
        inline vectorSIMDf& set(const vector3df &p) {_mm_store_ps(pointer,_mm_loadu_ps(&p.X)); makeSafe3D(); return *this;}
        inline vectorSIMDf& set(const vector2df &p) {_mm_store_ps(pointer,_mm_loadu_ps(&p.X)); makeSafe2D(); return *this;}
 
        //! going directly from vectorSIMD to irrlicht types is safe cause vectorSIMDf is wider
        inline vector2df& getAsVector2df(void) const
        {
            return *((vector2df*)pointer);
        }
        inline vector3df& getAsVector3df(void) const
        {
            return *((vector3df*)pointer);
        }
 
 
        //! Get length of the vector.
        inline float getLengthAsFloat() const
        {
            __m128 xmm0 = getAsRegister();
            float result;/*
#ifdef __IRR_COMPILE_WITH_SSE4_1
            xmm0 = _mm_dp_ps(xmm0,xmm0,);
            xmm0 = _mm_sqrt_ps(xmm0);
#error "Implementation in >=SSE4.1 not ready yet"
#elif __IRR_COMPILE_WITH_SSE3*/ /*
#ifdef __IRR_COMPILE_WITH_SSE3
            xmm0 = _mm_mul_ps(xmm0,xmm0);
            xmm0 = _mm_hadd_ps(xmm0,xmm0);
            xmm0 = _mm_sqrt_ps(_mm_hadd_ps(xmm0,xmm0));
            _mm_store_ss(&result,xmm0);
            return result;
#elif defined(__IRR_COMPILE_WITH_SSE2)
            xmm0 = _mm_mul_ps(xmm0,xmm0);
            xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(0,1,2,3)));
            xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(2,3,0,1)));
            xmm0 = _mm_sqrt_ps(xmm0);
            _mm_store_ss(&result,xmm0);
            return result;
#endif
        }
        //! Useful when you have to divide a vector by another vector's length (so you dont convert/store to a scalar)
        //! all components are filled with length
        //! if you need something else, you can get the register and shuffle
        inline vectorSIMDf getLength() const
        {
            __m128 xmm0 = getAsRegister();
#ifdef __IRR_COMPILE_WITH_SSE3
            xmm0 = _mm_mul_ps(xmm0,xmm0);
            xmm0 = _mm_hadd_ps(xmm0,xmm0);
            return _mm_sqrt_ps(_mm_hadd_ps(xmm0,xmm0));
#elif defined(__IRR_COMPILE_WITH_SSE2)
            xmm0 = _mm_mul_ps(xmm0,xmm0);
            xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(0,1,2,3)));
            xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(2,3,0,1)));
            return _mm_sqrt_ps(xmm0);
#endif
        }
 
        //! Get the dot product with another vector.
        inline float dotProductAsInt(const vectorSIMDf& other) const
        {
            float result;
            __m128 xmm0 = getAsRegister();
            __m128 xmm1 = other.getAsRegister();/*
#ifdef __IRR_COMPILE_WITH_SSE4_1
            xmm0 = _mm_dp_ps(xmm0,xmm1,);
#error "Implementation in >=SSE4.1 not ready yet"
#elif __IRR_COMPILE_WITH_SSE3*/ /*
#ifdef __IRR_COMPILE_WITH_SSE3
            xmm0 = _mm_mul_ps(xmm0,xmm1);
            xmm0 = _mm_hadd_ps(xmm0,xmm0);
            xmm0 = _mm_hadd_ps(xmm0,xmm0);
            _mm_store_ss(&result,xmm0);
            return result;
#elif defined(__IRR_COMPILE_WITH_SSE2)
            xmm0 = _mm_mul_ps(xmm0,xmm1);
            xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(0,1,2,3)));
            xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(2,3,0,1)));
            _mm_store_ss(&result,xmm0);
            return result;
#endif
        }
        inline vectorSIMDf dotProduct(const vectorSIMDf& other) const
        {
            __m128 xmm0 = getAsRegister();
            __m128 xmm1 = other.getAsRegister();/*
#ifdef __IRR_COMPILE_WITH_SSE4_1
            xmm0 = _mm_dp_ps(xmm0,xmm1,);
#error "Implementation in >=SSE4.1 not ready yet"
#elif __IRR_COMPILE_WITH_SSE3*/ /*
#ifdef __IRR_COMPILE_WITH_SSE3
            xmm0 = _mm_mul_ps(xmm0,xmm1);
            xmm0 = _mm_hadd_ps(xmm0,xmm0);
            return _mm_hadd_ps(xmm0,xmm0);
#elif defined(__IRR_COMPILE_WITH_SSE2)
            xmm0 = _mm_mul_ps(xmm0,xmm1);
            xmm0 = _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(0,1,2,3)));
            return _mm_add_ps(xmm0,FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(2,3,0,1)));
#endif
        }
 
        //! Get squared length of the vector.
        /** This is useful because it is much faster than getLength().
        \return Squared length of the vector. *
        inline float getLengthSQAsFloat() const
        {
            float result;
            _mm_store_ss(&result,dotProduct(*this).getAsRegister());
            return result;
        }
        //! Useful when you have to divide a vector by another vector's length (so you dont convert/store to a scalar)
        inline vectorSIMDf getLengthSQ() const
        {
            return dotProduct(*this);
        }
 
 
        //! Get distance from another point.
        /** Here, the vector is interpreted as point in 3 dimensional space. *
        inline float getDistanceFromAsFloat(const vectorSIMDf& other) const
        {
            float result;
            _mm_store_ss(&result,((*this)-other).getLength().getAsRegister());
            return result;
        }
        inline vectorSIMDf getDistanceFrom(const vectorSIMDf& other) const
        {
            return ((*this)-other).getLength();
        }
 
        //! Returns squared distance from another point.
        /** Here, the vector is interpreted as point in 3 dimensional space. *
        inline uint32_t getDistanceFromSQAsFloat(const vectorSIMDf& other) const
        {
            float result;
            _mm_store_ss(&result,((*this)-other).getLengthSQ().getAsRegister());
            return result;
        }
        inline uint32_t getDistanceFromSQ(const vectorSIMDf& other) const
        {
            return ((*this)-other).getLengthSQ();
        }
 
        //! Calculates the cross product with another vector.
        /** \param p Vector to multiply with.
        \return Crossproduct of this vector with p. *
        inline vectorSIMDf crossProduct(const vectorSIMDf& p) const
        {
            __m128 xmm0 = getAsRegister();
            __m128 xmm1 = p.getAsRegister();
#ifdef __IRR_COMPILE_WITH_SSE2 //! SSE2 implementation is faster than previous SSE3 implementation
            __m128 backslash = _mm_mul_ps(FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,0,2,1)),FAST_FLOAT_SHUFFLE(xmm1,_MM_SHUFFLE(3,1,0,2)));
            __m128 forwardslash = _mm_mul_ps(FAST_FLOAT_SHUFFLE(xmm0,_MM_SHUFFLE(3,1,0,2)),FAST_FLOAT_SHUFFLE(xmm1,_MM_SHUFFLE(3,0,2,1)));
            return _mm_sub_ps(backslash,forwardslash); //returns 0 in the last component :D
#endif
        }
 
        //! Inverts the vector.
        inline vectorSIMDf& invert()
        {
            _mm_store_ps(pointer,_mm_xor_ps(_mm_castsi128_ps(_mm_set_epi32(0x80000000u,0x80000000u,0x80000000u,0x80000000u)),getAsRegister()));
            return *this;
        }
        //! Returns component-wise absolute value of a
        inline vectorSIMDf abs(const vectorSIMDf& a) const
        {
            return _mm_and_ps(a.getAsRegister(),_mm_castsi128_ps(_mm_set_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)));
        }
        //! Returns component-wise absolute value of itself
        inline vectorSIMDf getAbsoluteValue() const
        {
            return abs(*this);
        }
*/
 
#ifdef _IRR_WINDOWS_
        __declspec(align(SIMD_ALIGNMENT)) union
#else
        union
#endif
        {
            struct{
                T X; T Y; T Z; T W;
            };
            struct{
                T x; T y; T z; T w;
            };
            struct{
                T r; T g; T b; T a;
            };
            struct{
                T s; T t; T p; T q;
            };
            T pointer[4];
        };
#ifdef _IRR_WINDOWS_
    };
#else
    } __attribute__ ((__aligned__(SIMD_ALIGNMENT)));
#endif
/*
    class vectorSIMDi32 : public vectorSIMD_32<int32_t>
    {
        //! Constructor with four different values, FASTEST IF the values are constant literals
        //yes this is correct usage with _mm_set_**(), due to little endianness the thing gets set in "reverse" order
        inline explicit vectorSIMDi32(const int32_t &nx, const int32_t &ny, const int32_t &nz, const int32_t &nw) {_mm_store_si128((__m128i*)pointer,_mm_set_epi32(nw,nz,ny,nx));}
        //! 3d constructor
        inline explicit vectorSIMDi32(const int32_t &nx, const int32_t &ny, const int32_t &nz) {_mm_store_si128((__m128i*)pointer,_mm_set_epi32(0,nz,ny,nx));}
        //! 2d constructor
        inline explicit vectorSIMDi32(const int32_t &nx, const int32_t &ny) {_mm_store_si128((__m128i*)pointer,_mm_set_epi32(0,0,ny,nx));}
    };
 
    class vectorSIMDu32 : public vectorSIMD_32<uint32_t>
    {
        //! Constructor with four different values, FASTEST IF the values are constant literals
        //yes this is correct usage with _mm_set_**(), due to little endianness the thing gets set in "reverse" order
        inline explicit vectorSIMDu32(const uint32_t &nx, const uint32_t &ny, const uint32_t &nz, const uint32_t &nw) {_mm_store_si128((__m128i*)pointer,_mm_set_epi32((const int32_t&)nw,(const int32_t&)nz,(const int32_t&)ny,(const int32_t&)nx));}
        //! 3d constructor
        inline explicit vectorSIMDu32(const uint32_t &nx, const uint32_t &ny, const uint32_t &nz) {_mm_store_si128((__m128i*)pointer,_mm_set_epi32(0,(const int32_t&)nz,(const int32_t&)ny,(const int32_t&)nx));}
        //! 2d constructor
        inline explicit vectorSIMDu32(const uint32_t &nx, const uint32_t &ny) {_mm_store_si128((__m128i*)pointer,_mm_set_epi32(0,0,(const int32_t&)ny,(const int32_t&)nx));}
    };
 
/*
    inline vectorSIMDi32 mix(const vectorSIMDi32& a, const vectorSIMDi32& b, const vectorSIMDf& t)
    {
        return a+(b-a)*t;
    }
    inline vectorSIMDi32 lerp(const vectorSIMDi32& a, const vectorSIMDi32& b, const vectorSIMDf& t)
    {
        return mix(a,b,t);
    }
    template<>
    inline vectorSIMDi32 max_(const vectorSIMDi32& a, const vectorSIMDi32& b)
    {
        return _mm_max_ps(a.getAsRegister(),b.getAsRegister());
    }
    template<>
    inline vectorSIMDi32 min_(const vectorSIMDi32& a, const vectorSIMDi32& b)
    {
        return _mm_min_ps(a.getAsRegister(),b.getAsRegister());
    }
    inline vectorSIMDi32 clamp(const vectorSIMDi32& value, const vectorSIMDi32& low, const vectorSIMDi32& high)
    {
        return min_(max_(value,low),high);
    }
 
    //! Typedef for an integer 3d vector.
    typedef vectorSIMDu32 vector4du32_SIMD;
    typedef vectorSIMDu32 vector3du32_SIMD;
    typedef vectorSIMDu32 vector2du32_SIMD;
 
    typedef vectorSIMDi32 vector4di32_SIMD;
    typedef vectorSIMDi32 vector3di32_SIMD;
    typedef vectorSIMDi32 vector2di32_SIMD;
 
 
    typedef vectorSIMDu16 vector8du16_SIMD;
    typedef vectorSIMDu16 vector7du16_SIMD;
    typedef vectorSIMDu16 vector6du16_SIMD;
    typedef vectorSIMDu16 vector5du16_SIMD;
    typedef vectorSIMDu16 vector4du16_SIMD;
    typedef vectorSIMDu16 vector3du16_SIMD;
    typedef vectorSIMDu16 vector2du16_SIMD;
 
    typedef vectorSIMDi16 vector8di16_SIMD;
    typedef vectorSIMDi16 vector7di16_SIMD;
    typedef vectorSIMDi16 vector6di16_SIMD;
    typedef vectorSIMDi16 vector5di16_SIMD;
    typedef vectorSIMDi16 vector4di16_SIMD;
    typedef vectorSIMDi16 vector3di16_SIMD;
    typedef vectorSIMDi16 vector2di16_SIMD;*/
 
} // end namespace core
} // end namespace irr
 
#endif
#endif
Last edited by devsh on Fri May 01, 2015 12:15 pm, edited 4 times in total.
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO

Post by devsh »

Support file SIMDswizzle.h for vectorSIMDf:
(Sorry, the forum completely freaked out when I used the code frame.)

#ifndef _SIMD_SWIZZLE_H_
#define _SIMD_SWIZZLE_H_
 
 
// Mixin that adds GLSL-style four-component swizzles (xxxx() ... wwww()) to a
// vector type holding four 32-bit lanes.
//
//   T - the concrete vector type. `this` is cast to T*, so the object is assumed
//       to really be a T; T must provide getAsRegister() and be constructible
//       from X.
//   X - the register type handed to and returned from shuffleFunc.
//
// A method name is read left to right as "which source component ends up in
// result lane 0, 1, 2, 3" (x=0, y=1, z=2, w=3). _MM_SHUFFLE takes its arguments
// from the highest lane to the lowest, so the indices appear reversed:
// abcd() uses _MM_SHUFFLE(d,c,b,a).
//
// shuffleFunc is only declared here; it has to be specialized for every (T,X)
// pair that is used.
template <class T, class X>
class SIMD_32bitSwizzleAble
{
    // Applies the compile-time lane permutation `mask` to `reg`.
    template<int mask>
    inline X shuffleFunc(X reg) const;
    public:
        inline T xxxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,0,0)>(((T*)this)->getAsRegister());}
        inline T xxxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,0,0)>(((T*)this)->getAsRegister());}
        inline T xxxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,0,0)>(((T*)this)->getAsRegister());}
        inline T xxxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,0,0)>(((T*)this)->getAsRegister());}
        inline T xxyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,0,0)>(((T*)this)->getAsRegister());}
        inline T xxyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,0,0)>(((T*)this)->getAsRegister());}
        inline T xxyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,0,0)>(((T*)this)->getAsRegister());}
        inline T xxyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,0,0)>(((T*)this)->getAsRegister());}
        inline T xxzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,0,0)>(((T*)this)->getAsRegister());}
        inline T xxzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,0,0)>(((T*)this)->getAsRegister());}
        inline T xxzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,0,0)>(((T*)this)->getAsRegister());}
        inline T xxzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,0,0)>(((T*)this)->getAsRegister());}
        inline T xxwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,0,0)>(((T*)this)->getAsRegister());}
        inline T xxwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,0,0)>(((T*)this)->getAsRegister());}
        inline T xxwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,0,0)>(((T*)this)->getAsRegister());}
        inline T xxww() const {return shuffleFunc<_MM_SHUFFLE(3,3,0,0)>(((T*)this)->getAsRegister());}
        inline T xyxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,1,0)>(((T*)this)->getAsRegister());}
        inline T xyxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,1,0)>(((T*)this)->getAsRegister());}
        inline T xyxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,1,0)>(((T*)this)->getAsRegister());}
        inline T xyxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,1,0)>(((T*)this)->getAsRegister());}
        inline T xyyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,1,0)>(((T*)this)->getAsRegister());}
        inline T xyyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,1,0)>(((T*)this)->getAsRegister());}
        inline T xyyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,1,0)>(((T*)this)->getAsRegister());}
        inline T xyyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,1,0)>(((T*)this)->getAsRegister());}
        inline T xyzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,1,0)>(((T*)this)->getAsRegister());}
        inline T xyzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,1,0)>(((T*)this)->getAsRegister());}
        inline T xyzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,1,0)>(((T*)this)->getAsRegister());}
        inline T xyzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,1,0)>(((T*)this)->getAsRegister());}
        inline T xywx() const {return shuffleFunc<_MM_SHUFFLE(0,3,1,0)>(((T*)this)->getAsRegister());}
        inline T xywy() const {return shuffleFunc<_MM_SHUFFLE(1,3,1,0)>(((T*)this)->getAsRegister());}
        inline T xywz() const {return shuffleFunc<_MM_SHUFFLE(2,3,1,0)>(((T*)this)->getAsRegister());}
        inline T xyww() const {return shuffleFunc<_MM_SHUFFLE(3,3,1,0)>(((T*)this)->getAsRegister());}
        inline T xzxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,2,0)>(((T*)this)->getAsRegister());}
        inline T xzxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,2,0)>(((T*)this)->getAsRegister());}
        inline T xzxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,2,0)>(((T*)this)->getAsRegister());}
        inline T xzxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,2,0)>(((T*)this)->getAsRegister());}
        inline T xzyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,2,0)>(((T*)this)->getAsRegister());}
        inline T xzyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,2,0)>(((T*)this)->getAsRegister());}
        inline T xzyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,2,0)>(((T*)this)->getAsRegister());}
        inline T xzyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,2,0)>(((T*)this)->getAsRegister());}
        inline T xzzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,2,0)>(((T*)this)->getAsRegister());}
        inline T xzzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,2,0)>(((T*)this)->getAsRegister());}
        inline T xzzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,2,0)>(((T*)this)->getAsRegister());}
        inline T xzzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,2,0)>(((T*)this)->getAsRegister());}
        inline T xzwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,2,0)>(((T*)this)->getAsRegister());}
        inline T xzwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,2,0)>(((T*)this)->getAsRegister());}
        inline T xzwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,2,0)>(((T*)this)->getAsRegister());}
        inline T xzww() const {return shuffleFunc<_MM_SHUFFLE(3,3,2,0)>(((T*)this)->getAsRegister());}
        inline T xwxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,3,0)>(((T*)this)->getAsRegister());}
        inline T xwxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,3,0)>(((T*)this)->getAsRegister());}
        inline T xwxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,3,0)>(((T*)this)->getAsRegister());}
        inline T xwxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,3,0)>(((T*)this)->getAsRegister());}
        inline T xwyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,3,0)>(((T*)this)->getAsRegister());}
        inline T xwyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,3,0)>(((T*)this)->getAsRegister());}
        inline T xwyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,3,0)>(((T*)this)->getAsRegister());}
        inline T xwyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,3,0)>(((T*)this)->getAsRegister());}
        inline T xwzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,3,0)>(((T*)this)->getAsRegister());}
        inline T xwzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,3,0)>(((T*)this)->getAsRegister());}
        inline T xwzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,3,0)>(((T*)this)->getAsRegister());}
        inline T xwzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,3,0)>(((T*)this)->getAsRegister());}
        inline T xwwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,3,0)>(((T*)this)->getAsRegister());}
        inline T xwwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,3,0)>(((T*)this)->getAsRegister());}
        inline T xwwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,3,0)>(((T*)this)->getAsRegister());}
        inline T xwww() const {return shuffleFunc<_MM_SHUFFLE(3,3,3,0)>(((T*)this)->getAsRegister());}
        // lane 0 must come from y (index 1); the mask was previously (0,0,0,0), i.e. xxxx
        inline T yxxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,0,1)>(((T*)this)->getAsRegister());}
        inline T yxxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,0,1)>(((T*)this)->getAsRegister());}
        inline T yxxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,0,1)>(((T*)this)->getAsRegister());}
        inline T yxxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,0,1)>(((T*)this)->getAsRegister());}
        inline T yxyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,0,1)>(((T*)this)->getAsRegister());}
        inline T yxyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,0,1)>(((T*)this)->getAsRegister());}
        inline T yxyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,0,1)>(((T*)this)->getAsRegister());}
        inline T yxyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,0,1)>(((T*)this)->getAsRegister());}
        inline T yxzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,0,1)>(((T*)this)->getAsRegister());}
        inline T yxzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,0,1)>(((T*)this)->getAsRegister());}
        inline T yxzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,0,1)>(((T*)this)->getAsRegister());}
        inline T yxzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,0,1)>(((T*)this)->getAsRegister());}
        inline T yxwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,0,1)>(((T*)this)->getAsRegister());}
        inline T yxwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,0,1)>(((T*)this)->getAsRegister());}
        inline T yxwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,0,1)>(((T*)this)->getAsRegister());}
        inline T yxww() const {return shuffleFunc<_MM_SHUFFLE(3,3,0,1)>(((T*)this)->getAsRegister());}
        inline T yyxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,1,1)>(((T*)this)->getAsRegister());}
        inline T yyxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,1,1)>(((T*)this)->getAsRegister());}
        inline T yyxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,1,1)>(((T*)this)->getAsRegister());}
        inline T yyxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,1,1)>(((T*)this)->getAsRegister());}
        inline T yyyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,1,1)>(((T*)this)->getAsRegister());}
        inline T yyyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,1,1)>(((T*)this)->getAsRegister());}
        inline T yyyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,1,1)>(((T*)this)->getAsRegister());}
        inline T yyyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,1,1)>(((T*)this)->getAsRegister());}
        inline T yyzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,1,1)>(((T*)this)->getAsRegister());}
        inline T yyzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,1,1)>(((T*)this)->getAsRegister());}
        inline T yyzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,1,1)>(((T*)this)->getAsRegister());}
        inline T yyzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,1,1)>(((T*)this)->getAsRegister());}
        inline T yywx() const {return shuffleFunc<_MM_SHUFFLE(0,3,1,1)>(((T*)this)->getAsRegister());}
        inline T yywy() const {return shuffleFunc<_MM_SHUFFLE(1,3,1,1)>(((T*)this)->getAsRegister());}
        inline T yywz() const {return shuffleFunc<_MM_SHUFFLE(2,3,1,1)>(((T*)this)->getAsRegister());}
        inline T yyww() const {return shuffleFunc<_MM_SHUFFLE(3,3,1,1)>(((T*)this)->getAsRegister());}
        inline T yzxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,2,1)>(((T*)this)->getAsRegister());}
        inline T yzxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,2,1)>(((T*)this)->getAsRegister());}
        inline T yzxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,2,1)>(((T*)this)->getAsRegister());}
        inline T yzxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,2,1)>(((T*)this)->getAsRegister());}
        inline T yzyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,2,1)>(((T*)this)->getAsRegister());}
        inline T yzyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,2,1)>(((T*)this)->getAsRegister());}
        inline T yzyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,2,1)>(((T*)this)->getAsRegister());}
        inline T yzyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,2,1)>(((T*)this)->getAsRegister());}
        inline T yzzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,2,1)>(((T*)this)->getAsRegister());}
        inline T yzzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,2,1)>(((T*)this)->getAsRegister());}
        inline T yzzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,2,1)>(((T*)this)->getAsRegister());}
        inline T yzzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,2,1)>(((T*)this)->getAsRegister());}
        inline T yzwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,2,1)>(((T*)this)->getAsRegister());}
        inline T yzwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,2,1)>(((T*)this)->getAsRegister());}
        inline T yzwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,2,1)>(((T*)this)->getAsRegister());}
        inline T yzww() const {return shuffleFunc<_MM_SHUFFLE(3,3,2,1)>(((T*)this)->getAsRegister());}
        inline T ywxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,3,1)>(((T*)this)->getAsRegister());}
        inline T ywxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,3,1)>(((T*)this)->getAsRegister());}
        inline T ywxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,3,1)>(((T*)this)->getAsRegister());}
        inline T ywxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,3,1)>(((T*)this)->getAsRegister());}
        inline T ywyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,3,1)>(((T*)this)->getAsRegister());}
        inline T ywyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,3,1)>(((T*)this)->getAsRegister());}
        inline T ywyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,3,1)>(((T*)this)->getAsRegister());}
        inline T ywyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,3,1)>(((T*)this)->getAsRegister());}
        inline T ywzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,3,1)>(((T*)this)->getAsRegister());}
        inline T ywzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,3,1)>(((T*)this)->getAsRegister());}
        inline T ywzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,3,1)>(((T*)this)->getAsRegister());}
        inline T ywzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,3,1)>(((T*)this)->getAsRegister());}
        inline T ywwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,3,1)>(((T*)this)->getAsRegister());}
        inline T ywwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,3,1)>(((T*)this)->getAsRegister());}
        inline T ywwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,3,1)>(((T*)this)->getAsRegister());}
        inline T ywww() const {return shuffleFunc<_MM_SHUFFLE(3,3,3,1)>(((T*)this)->getAsRegister());}
        inline T zxxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,0,2)>(((T*)this)->getAsRegister());}
        inline T zxxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,0,2)>(((T*)this)->getAsRegister());}
        inline T zxxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,0,2)>(((T*)this)->getAsRegister());}
        inline T zxxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,0,2)>(((T*)this)->getAsRegister());}
        inline T zxyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,0,2)>(((T*)this)->getAsRegister());}
        inline T zxyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,0,2)>(((T*)this)->getAsRegister());}
        inline T zxyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,0,2)>(((T*)this)->getAsRegister());}
        inline T zxyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,0,2)>(((T*)this)->getAsRegister());}
        inline T zxzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,0,2)>(((T*)this)->getAsRegister());}
        inline T zxzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,0,2)>(((T*)this)->getAsRegister());}
        inline T zxzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,0,2)>(((T*)this)->getAsRegister());}
        inline T zxzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,0,2)>(((T*)this)->getAsRegister());}
        inline T zxwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,0,2)>(((T*)this)->getAsRegister());}
        inline T zxwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,0,2)>(((T*)this)->getAsRegister());}
        inline T zxwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,0,2)>(((T*)this)->getAsRegister());}
        inline T zxww() const {return shuffleFunc<_MM_SHUFFLE(3,3,0,2)>(((T*)this)->getAsRegister());}
        inline T zyxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,1,2)>(((T*)this)->getAsRegister());}
        inline T zyxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,1,2)>(((T*)this)->getAsRegister());}
        inline T zyxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,1,2)>(((T*)this)->getAsRegister());}
        inline T zyxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,1,2)>(((T*)this)->getAsRegister());}
        inline T zyyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,1,2)>(((T*)this)->getAsRegister());}
        inline T zyyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,1,2)>(((T*)this)->getAsRegister());}
        inline T zyyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,1,2)>(((T*)this)->getAsRegister());}
        inline T zyyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,1,2)>(((T*)this)->getAsRegister());}
        inline T zyzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,1,2)>(((T*)this)->getAsRegister());}
        inline T zyzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,1,2)>(((T*)this)->getAsRegister());}
        inline T zyzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,1,2)>(((T*)this)->getAsRegister());}
        inline T zyzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,1,2)>(((T*)this)->getAsRegister());}
        inline T zywx() const {return shuffleFunc<_MM_SHUFFLE(0,3,1,2)>(((T*)this)->getAsRegister());}
        inline T zywy() const {return shuffleFunc<_MM_SHUFFLE(1,3,1,2)>(((T*)this)->getAsRegister());}
        inline T zywz() const {return shuffleFunc<_MM_SHUFFLE(2,3,1,2)>(((T*)this)->getAsRegister());}
        inline T zyww() const {return shuffleFunc<_MM_SHUFFLE(3,3,1,2)>(((T*)this)->getAsRegister());}
        inline T zzxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,2,2)>(((T*)this)->getAsRegister());}
        inline T zzxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,2,2)>(((T*)this)->getAsRegister());}
        inline T zzxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,2,2)>(((T*)this)->getAsRegister());}
        inline T zzxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,2,2)>(((T*)this)->getAsRegister());}
        inline T zzyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,2,2)>(((T*)this)->getAsRegister());}
        inline T zzyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,2,2)>(((T*)this)->getAsRegister());}
        inline T zzyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,2,2)>(((T*)this)->getAsRegister());}
        inline T zzyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,2,2)>(((T*)this)->getAsRegister());}
        inline T zzzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,2,2)>(((T*)this)->getAsRegister());}
        inline T zzzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,2,2)>(((T*)this)->getAsRegister());}
        inline T zzzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,2,2)>(((T*)this)->getAsRegister());}
        inline T zzzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,2,2)>(((T*)this)->getAsRegister());}
        inline T zzwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,2,2)>(((T*)this)->getAsRegister());}
        inline T zzwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,2,2)>(((T*)this)->getAsRegister());}
        inline T zzwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,2,2)>(((T*)this)->getAsRegister());}
        inline T zzww() const {return shuffleFunc<_MM_SHUFFLE(3,3,2,2)>(((T*)this)->getAsRegister());}
        inline T zwxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,3,2)>(((T*)this)->getAsRegister());}
        inline T zwxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,3,2)>(((T*)this)->getAsRegister());}
        inline T zwxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,3,2)>(((T*)this)->getAsRegister());}
        inline T zwxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,3,2)>(((T*)this)->getAsRegister());}
        inline T zwyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,3,2)>(((T*)this)->getAsRegister());}
        inline T zwyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,3,2)>(((T*)this)->getAsRegister());}
        inline T zwyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,3,2)>(((T*)this)->getAsRegister());}
        inline T zwyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,3,2)>(((T*)this)->getAsRegister());}
        inline T zwzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,3,2)>(((T*)this)->getAsRegister());}
        inline T zwzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,3,2)>(((T*)this)->getAsRegister());}
        inline T zwzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,3,2)>(((T*)this)->getAsRegister());}
        inline T zwzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,3,2)>(((T*)this)->getAsRegister());}
        inline T zwwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,3,2)>(((T*)this)->getAsRegister());}
        inline T zwwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,3,2)>(((T*)this)->getAsRegister());}
        inline T zwwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,3,2)>(((T*)this)->getAsRegister());}
        inline T zwww() const {return shuffleFunc<_MM_SHUFFLE(3,3,3,2)>(((T*)this)->getAsRegister());}
        inline T wxxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,0,3)>(((T*)this)->getAsRegister());}
        inline T wxxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,0,3)>(((T*)this)->getAsRegister());}
        inline T wxxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,0,3)>(((T*)this)->getAsRegister());}
        inline T wxxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,0,3)>(((T*)this)->getAsRegister());}
        inline T wxyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,0,3)>(((T*)this)->getAsRegister());}
        inline T wxyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,0,3)>(((T*)this)->getAsRegister());}
        inline T wxyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,0,3)>(((T*)this)->getAsRegister());}
        inline T wxyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,0,3)>(((T*)this)->getAsRegister());}
        inline T wxzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,0,3)>(((T*)this)->getAsRegister());}
        inline T wxzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,0,3)>(((T*)this)->getAsRegister());}
        inline T wxzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,0,3)>(((T*)this)->getAsRegister());}
        inline T wxzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,0,3)>(((T*)this)->getAsRegister());}
        inline T wxwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,0,3)>(((T*)this)->getAsRegister());}
        inline T wxwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,0,3)>(((T*)this)->getAsRegister());}
        inline T wxwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,0,3)>(((T*)this)->getAsRegister());}
        inline T wxww() const {return shuffleFunc<_MM_SHUFFLE(3,3,0,3)>(((T*)this)->getAsRegister());}
        inline T wyxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,1,3)>(((T*)this)->getAsRegister());}
        inline T wyxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,1,3)>(((T*)this)->getAsRegister());}
        inline T wyxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,1,3)>(((T*)this)->getAsRegister());}
        inline T wyxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,1,3)>(((T*)this)->getAsRegister());}
        inline T wyyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,1,3)>(((T*)this)->getAsRegister());}
        inline T wyyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,1,3)>(((T*)this)->getAsRegister());}
        inline T wyyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,1,3)>(((T*)this)->getAsRegister());}
        inline T wyyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,1,3)>(((T*)this)->getAsRegister());}
        inline T wyzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,1,3)>(((T*)this)->getAsRegister());}
        inline T wyzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,1,3)>(((T*)this)->getAsRegister());}
        inline T wyzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,1,3)>(((T*)this)->getAsRegister());}
        inline T wyzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,1,3)>(((T*)this)->getAsRegister());}
        inline T wywx() const {return shuffleFunc<_MM_SHUFFLE(0,3,1,3)>(((T*)this)->getAsRegister());}
        inline T wywy() const {return shuffleFunc<_MM_SHUFFLE(1,3,1,3)>(((T*)this)->getAsRegister());}
        inline T wywz() const {return shuffleFunc<_MM_SHUFFLE(2,3,1,3)>(((T*)this)->getAsRegister());}
        inline T wyww() const {return shuffleFunc<_MM_SHUFFLE(3,3,1,3)>(((T*)this)->getAsRegister());}
        inline T wzxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,2,3)>(((T*)this)->getAsRegister());}
        inline T wzxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,2,3)>(((T*)this)->getAsRegister());}
        inline T wzxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,2,3)>(((T*)this)->getAsRegister());}
        inline T wzxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,2,3)>(((T*)this)->getAsRegister());}
        inline T wzyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,2,3)>(((T*)this)->getAsRegister());}
        inline T wzyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,2,3)>(((T*)this)->getAsRegister());}
        inline T wzyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,2,3)>(((T*)this)->getAsRegister());}
        inline T wzyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,2,3)>(((T*)this)->getAsRegister());}
        inline T wzzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,2,3)>(((T*)this)->getAsRegister());}
        inline T wzzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,2,3)>(((T*)this)->getAsRegister());}
        inline T wzzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,2,3)>(((T*)this)->getAsRegister());}
        inline T wzzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,2,3)>(((T*)this)->getAsRegister());}
        inline T wzwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,2,3)>(((T*)this)->getAsRegister());}
        inline T wzwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,2,3)>(((T*)this)->getAsRegister());}
        inline T wzwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,2,3)>(((T*)this)->getAsRegister());}
        inline T wzww() const {return shuffleFunc<_MM_SHUFFLE(3,3,2,3)>(((T*)this)->getAsRegister());}
        inline T wwxx() const {return shuffleFunc<_MM_SHUFFLE(0,0,3,3)>(((T*)this)->getAsRegister());}
        inline T wwxy() const {return shuffleFunc<_MM_SHUFFLE(1,0,3,3)>(((T*)this)->getAsRegister());}
        inline T wwxz() const {return shuffleFunc<_MM_SHUFFLE(2,0,3,3)>(((T*)this)->getAsRegister());}
        inline T wwxw() const {return shuffleFunc<_MM_SHUFFLE(3,0,3,3)>(((T*)this)->getAsRegister());}
        inline T wwyx() const {return shuffleFunc<_MM_SHUFFLE(0,1,3,3)>(((T*)this)->getAsRegister());}
        inline T wwyy() const {return shuffleFunc<_MM_SHUFFLE(1,1,3,3)>(((T*)this)->getAsRegister());}
        inline T wwyz() const {return shuffleFunc<_MM_SHUFFLE(2,1,3,3)>(((T*)this)->getAsRegister());}
        inline T wwyw() const {return shuffleFunc<_MM_SHUFFLE(3,1,3,3)>(((T*)this)->getAsRegister());}
        inline T wwzx() const {return shuffleFunc<_MM_SHUFFLE(0,2,3,3)>(((T*)this)->getAsRegister());}
        inline T wwzy() const {return shuffleFunc<_MM_SHUFFLE(1,2,3,3)>(((T*)this)->getAsRegister());}
        inline T wwzz() const {return shuffleFunc<_MM_SHUFFLE(2,2,3,3)>(((T*)this)->getAsRegister());}
        inline T wwzw() const {return shuffleFunc<_MM_SHUFFLE(3,2,3,3)>(((T*)this)->getAsRegister());}
        inline T wwwx() const {return shuffleFunc<_MM_SHUFFLE(0,3,3,3)>(((T*)this)->getAsRegister());}
        inline T wwwy() const {return shuffleFunc<_MM_SHUFFLE(1,3,3,3)>(((T*)this)->getAsRegister());}
        inline T wwwz() const {return shuffleFunc<_MM_SHUFFLE(2,3,3,3)>(((T*)this)->getAsRegister());}
        inline T wwww() const {return shuffleFunc<_MM_SHUFFLE(3,3,3,3)>(((T*)this)->getAsRegister());}
};
 
// Permutes the four 32-bit lanes of the __m128 value X by reinterpreting it as
// integers, applying _mm_shuffle_epi32 with the immediate Y (e.g. built with
// _MM_SHUFFLE), and reinterpreting the result back as floats.
#define FAST_FLOAT_SHUFFLE(X,Y) _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(X),Y))
 
// Float vectors: view the register as integers, permute its lanes with the
// integer shuffle, then view the result as floats again.
template <>
template <int mask>
inline __m128 SIMD_32bitSwizzleAble<vectorSIMDf,__m128>::shuffleFunc(__m128 reg) const
{
    const __m128i asIntegers = _mm_castps_si128(reg);
    return _mm_castsi128_ps(_mm_shuffle_epi32(asIntegers,mask));
}
 
// Signed 32-bit integer vectors: the integer shuffle applies directly.
template <>
template <int mask>
inline __m128i SIMD_32bitSwizzleAble<vectorSIMD_32<int32_t>,__m128i>::shuffleFunc(__m128i reg) const
{
    const __m128i permuted = _mm_shuffle_epi32(reg,mask);
    return permuted;
}
 
// Unsigned 32-bit integer vectors: the integer shuffle applies directly.
template <>
template <int mask>
inline __m128i SIMD_32bitSwizzleAble<vectorSIMD_32<uint32_t>,__m128i>::shuffleFunc(__m128i reg) const
{
    const __m128i permuted = _mm_shuffle_epi32(reg,mask);
    return permuted;
}
 
#endif
 
Last edited by devsh on Fri May 01, 2015 12:34 pm, edited 4 times in total.
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO

Post by devsh »

matrixSIMD4.h :

Code: Select all

 
// Copyright (C) 2002-2012 Nikolaus Gebhardt
// This file is part of the "Irrlicht Engine".
// For conditions of distribution and use, see copyright notice in irrlicht.h
 
// Include guard unique to this header. The previous name,
// __IRR_MATRIX_H_INCLUDED__, appears to have been copied from matrix4.h
// (NOTE(review): confirm against matrix4.h). With a shared guard, whichever of
// the two headers is seen second expands to nothing, so the
// #include "matrix4.h" below would not declare matrix4.
#ifndef __IRR_MATRIX_SIMD4_H_INCLUDED__
#define __IRR_MATRIX_SIMD4_H_INCLUDED__
 
#define __IRR_COMPILE_WITH_X86_SIMD_
#ifdef __IRR_COMPILE_WITH_X86_SIMD_
 
#include "matrix4.h"
#include "vectorSIMD.h"
 
 
namespace irr
{
namespace core
{
 
    //! 4x4 matrix. Mostly used as transformation matrix for 3d calculations.
    /** Translations are stored in the 4th column; the memory layout is the transpose of the Irrlicht matrix4 layout. */
    class matrixSIMD4
    {
        public:
 
            //! Default constructor
            /** \param constructor Choose the initialization style */
            matrixSIMD4( matrix4::eConstructor constructor = matrix4::EM4CONST_IDENTITY );
            //! Copy constructor
            /** \param other Other matrix to copy from
            \param constructor Choose the initialization style */
            matrixSIMD4(const matrixSIMD4& other, matrix4::eConstructor constructor = matrix4::EM4CONST_COPY);
            //! init from 4 row vectors
            inline matrixSIMD4(const vectorSIMDf& row0,const vectorSIMDf& row1,const vectorSIMDf& row2,const vectorSIMDf& row3)
            {
                rows[0] = row0;
                rows[1] = row1;
                rows[2] = row2;
                rows[3] = row3;
            }
            //! init from 16 floats
            inline matrixSIMD4( const float& x0,const float& y0,const float& z0,const float& w0,
                                const float& x1,const float& y1,const float& z1,const float& w1,
                                const float& x2,const float& y2,const float& z2,const float& w2,
                                const float& x3,const float& y3,const float& z3,const float& w3)
            {
                rows[0] = _mm_set_ps(w0,z0,y0,x0);
                rows[1] = _mm_set_ps(w1,z1,y1,x1);
                rows[2] = _mm_set_ps(w2,z2,y2,x2);
                rows[3] = _mm_set_ps(w3,z3,y3,x3);
            }
            //! init from 1 float
            explicit matrixSIMD4( const float& scalar)
            {
                rows[0] = _mm_set1_ps(scalar);
                rows[1] = _mm_set1_ps(scalar);
                rows[2] = _mm_set1_ps(scalar);
                rows[3] = _mm_set1_ps(scalar);
            }
            //! init from 1 float
            explicit matrixSIMD4( const matrix4& retardedIrrlichtMatrix)
            {
                __m128 xmm0 = _mm_loadu_ps(retardedIrrlichtMatrix.pointer);
                __m128 xmm1 = _mm_loadu_ps(retardedIrrlichtMatrix.pointer+4);
                __m128 xmm2 = _mm_loadu_ps(retardedIrrlichtMatrix.pointer+8);
                __m128 xmm3 = _mm_loadu_ps(retardedIrrlichtMatrix.pointer+12);
 
                _MM_TRANSPOSE4_PS(xmm0,xmm1,xmm2,xmm3);
 
                rows[0] = xmm0;
                rows[1] = xmm1;
                rows[2] = xmm2;
                rows[3] = xmm3;
            }
 
            inline matrix4 getAsRetardedIrrlichtMatrix()
            {
                __m128 xmm0 = rows[0].getAsRegister();
                __m128 xmm1 = rows[1].getAsRegister();
                __m128 xmm2 = rows[2].getAsRegister();
                __m128 xmm3 = rows[3].getAsRegister();
                _MM_TRANSPOSE4_PS(xmm0,xmm1,xmm2,xmm3)
 
#ifdef _IRR_WINDOWS_
                __declspec(align(16)) matrix4 outRIMatrix;
#else
                matrix4 outRIMatrix __attribute__ ((__aligned__(16)));
#endif
                _mm_store_ps(outRIMatrix.pointer,xmm0);
                _mm_store_ps(outRIMatrix.pointer+1,xmm1);
                _mm_store_ps(outRIMatrix.pointer+2,xmm2);
                _mm_store_ps(outRIMatrix.pointer+3,xmm3);
 
                return outRIMatrix;
            }
 
 
            //! Simple operator for directly accessing every element of the matrix.
            inline float& operator()(const s32 &row, const s32 &col)
            {
#if defined ( USE_MATRIX_TEST )
                definitelyIdentityMatrix=false;
#endif
                return rows[row].pointer[col];
            }
 
            //! Simple operator for directly accessing every element of the matrix.
            inline const float& operator()(const s32 &row, const s32 &col) const { return rows[row].pointer[col]; }
 
            //! Simple operator for linearly accessing every element of the matrix.
            inline float& operator[](u32 index)
            {
#if defined ( USE_MATRIX_TEST )
                definitelyIdentityMatrix=false;
#endif
                return ((float*)rows[0].pointer)[index];
            }
 
            //! Simple operator for linearly accessing every element of the matrix.
            inline const float& operator[](u32 index) const { return ((float*)rows[0].pointer)[index]; }
 
            //! Sets this matrix equal to the other matrix.
            /** Copies all four rows; when USE_MATRIX_TEST is defined the identity
            flag of the other matrix is copied as well. */
            matrixSIMD4& operator=(const matrixSIMD4 &other);
 
            //! Sets all elements of this matrix to the value.
            /** \param scalar Value broadcast into every element. */
            matrixSIMD4& operator=(const float& scalar);
 
            //! Returns pointer to internal array
            inline const float* pointer() const { return rows[0].pointer; }
            inline float* pointer()
            {
#if defined ( USE_MATRIX_TEST )
                definitelyIdentityMatrix=false;
#endif
                return rows[0].pointer;
            }
 
            //! Returns true if other matrix is equal to this matrix.
            inline bool operator==(const matrixSIMD4 &other) const;
 
            //! Returns true if other matrix is not equal to this matrix.
            inline bool operator!=(const matrixSIMD4 &other) const;
 
            //! Add another matrix, returning the sum as a new matrix.
            matrixSIMD4 operator+(const matrixSIMD4& other) const;
 
            //! Add another matrix to this one in place.
            matrixSIMD4& operator+=(const matrixSIMD4& other);
 
            //! Subtract another matrix, returning the difference as a new matrix.
            matrixSIMD4 operator-(const matrixSIMD4& other) const;
 
            //! Subtract another matrix in place.
            matrixSIMD4& operator-=(const matrixSIMD4& other);
 
            //! Set this matrix to the product of two matrices
            /** Calculates other_a*other_b. When USE_MATRIX_TEST is defined, an identity
            operand short-circuits to a plain copy of the other operand. */
            inline matrixSIMD4& setbyproduct(const matrixSIMD4& other_a,const matrixSIMD4& other_b );
 
            //! Set this matrix to the product of two matrices
            /** Calculates other_a*other_b (each result row is a row of other_a dotted
            with the columns of other_b), no optimization used,
            use it if you know you never have an identity matrix */
            matrixSIMD4& setbyproduct_nocheck(const matrixSIMD4& other_a,const matrixSIMD4& other_b );
 
            //! Multiply by another matrix.
            /** Calculates this*other and returns it as a new matrix */
            matrixSIMD4 operator*(const matrixSIMD4& other) const;
 
            //! Multiply by another matrix in place.
            /** Calculates this*other, stores it in this matrix and returns *this */
            matrixSIMD4& operator*=(const matrixSIMD4& other);
 
            //! Multiply every element by a scalar, returning a new matrix.
            matrixSIMD4 operator*(const float& scalar) const;
 
            //! Multiply every element by a scalar in place.
            matrixSIMD4& operator*=(const float& scalar);
 
            //! Set matrix to identity.
            inline matrixSIMD4& makeIdentity()
            {
                rows[0] = _mm_set_ps(0,0,0,1);
                rows[1] = _mm_set_ps(0,0,1,0);
                rows[2] = _mm_set_ps(0,1,0,0);
                rows[3] = _mm_set_ps(1,0,0,0);
#if defined ( USE_MATRIX_TEST )
                definitelyIdentityMatrix=true;
#endif
                return *this;
            }
 
 
            //! Returns true if the matrix is the identity matrix
            /** The rows are compared exactly (no epsilon) against the identity rows. */
            bool isIdentity() const;
 
            //! Returns true if the matrix is orthogonal
            /** Tests whether this matrix multiplied by its transpose is the identity. */
            inline bool isOrthogonal() const;
 
            //! Set the translation of the current matrix. Will erase any previous values.
            /** The X, Y, Z and W components are written to the W element of rows 0 to 3. */
            matrixSIMD4& setTranslation( const vectorSIMDf& translation );
 
            //! Gets the current translation
            /** Returns the W elements of rows 0 to 3 as one vector. */
            vectorSIMDf getTranslation() const;
 
            //! Set the inverse translation of the current matrix. Will erase any previous values.
            /** Equivalent to setTranslation() with the negated vector. */
            matrixSIMD4& setInverseTranslation( const vectorSIMDf& translation );
            //! Set Scale
            /** Writes the components of scale onto the main diagonal only. */
            matrixSIMD4& setScale( const vectorSIMDf& scale );
 
            //! Set Scale
            matrixSIMD4& setScale( const float scale ) { return setScale(_mm_set1_ps(scale)); }
 
            //! Get Scale
            /** Returns the square root of the squared scale computed by getScaleSQ(). */
            core::vectorSIMDf getScale() const;
 
            //! Translate a vector by the inverse of the translation part of this matrix.
            // NOTE(review): the out-of-class definition takes vectorSIMDf&, not vector3df& - confirm which is intended.
            void inverseTranslateVect( vector3df& vect ) const;
/*
            //! Rotate a vector by the inverse of the rotation part of this matrix.
            void inverseRotateVect( vector3df& vect ) const;
 
            //! Rotate a vector by the rotation part of this matrix.
            void rotateVect( vector3df& vect ) const;
 
            //! An alternate transform vector method, writing into a second vector
            void rotateVect(core::vector3df& out, const core::vector3df& in) const;
 
            //! An alternate transform vector method, writing into an array of 3 floats
            void rotateVect(float *out,const core::vector3df &in) const;
*/
            //! Transforms the vector by this matrix
            // NOTE(review): the three transformVect definitions take vectorSIMDf where these declarations say vector3df - confirm.
            void transformVect( vector3df& vect) const;
 
            //! Transforms input vector by this matrix and stores result in output vector
            void transformVect( vector3df& out, const vector3df& in ) const;
 
            //! An alternate transform vector method, writing into an array of 4 floats
            /** The definition writes with an aligned 128-bit store, so out must be 16-byte aligned. */
            void transformVect(float *out,const core::vector3df &in) const;
 
            //! An alternate transform vector method, reading from and writing to an array of 3 floats
            void transformVec3(float *out, const float * in) const;
 
            //! Translate a vector by the translation part of this matrix.
            void translateVect( vector3df& vect ) const;
            //! Creates a new matrix as interpolated matrix from two other ones.
            /** \param b: other matrix to interpolate with
            \param factor: Must be a value between 0 and 1. */
            matrixSIMD4 interpolate(const core::matrixSIMD4& b, float factor) const;
 
            //! Gets transposed matrix
            matrixSIMD4 getTransposed() const;
 
            //! Gets transposed matrix, writing it into dest
            inline void getTransposed( matrixSIMD4& dest ) const;
 
        private:
            //! Matrix data, stored in row-major order (one SIMD vector per row)
            vectorSIMDf rows[4];
#if defined ( USE_MATRIX_TEST )
            //! Cached flag: non-zero when this matrix is known to be the identity.
            //! Mutable so that const queries such as isIdentity() can update it.
            mutable u32 definitelyIdentityMatrix;
#endif
#if defined ( USE_MATRIX_TEST_DEBUG )
            //! Debug bookkeeping: id is taken from MTest.ID at construction, calls starts at 0
            u32 id;
            mutable u32 calls;
#endif
 
    };
 
 

Code: Select all

 
    // Default constructor
    inline matrixSIMD4::matrixSIMD4( eConstructor constructor )
#if defined ( USE_MATRIX_TEST )
        : definitelyIdentityMatrix(BIT_UNTESTED)
#endif
#if defined ( USE_MATRIX_TEST_DEBUG )
        ,id ( MTest.ID++), calls ( 0 )
#endif
    {
        switch ( constructor )
        {
            case EM4CONST_NOTHING:
            case EM4CONST_COPY:
                break;
            case EM4CONST_IDENTITY:
            case EM4CONST_INVERSE:
            default:
                makeIdentity();
                break;
        }
    }
 
    // Copy constructor
    inline matrixSIMD4::matrixSIMD4( const matrixSIMD4& other, eConstructor constructor)
#if defined ( USE_MATRIX_TEST )
        : definitelyIdentityMatrix(BIT_UNTESTED)
#endif
#if defined ( USE_MATRIX_TEST_DEBUG )
        ,id ( MTest.ID++), calls ( 0 )
#endif
    {
        switch ( constructor )
        {
            case EM4CONST_IDENTITY:
                makeIdentity();
                break;
            case EM4CONST_NOTHING:
                break;
            case EM4CONST_COPY:
                *this = other;
                break;
            case EM4CONST_TRANSPOSED:
                other.getTransposed(*this);
                break;
            case EM4CONST_INVERSE:
                if (!other.getInverse(*this))
                    *this = 0.f;
                break;
            case EM4CONST_INVERSE_TRANSPOSED:
                if (!other.getInverseTransposed(*this))
                    *this = 0.f;
                else
                    *this = getTransposed();
                break;
        }
    }
 
    //! Add another matrix.
    inline matrixSIMD4 matrixSIMD4::operator+(const matrixSIMD4& other) const
    {
        matrixSIMD4 temp ( EM4CONST_NOTHING );
 
        temp.rows[0] = rows[0]+other.rows[0];
        temp.rows[1] = rows[1]+other.rows[1];
        temp.rows[2] = rows[2]+other.rows[2];
        temp.rows[3] = rows[3]+other.rows[3];
 
        return temp;
    }
 
    //! Add another matrix.
    inline matrixSIMD4& matrixSIMD4::operator+=(const matrixSIMD4& other)
    {
        rows[0] += other.rows[0];
        rows[1] += other.rows[1];
        rows[2] += other.rows[2];
        rows[3] += other.rows[3];
 
        return *this;
    }
 
    //! Subtract another matrix.
    inline matrixSIMD4 matrixSIMD4::operator-(const matrixSIMD4& other) const
    {
        matrixSIMD4 temp ( EM4CONST_NOTHING );
 
        temp.rows[0] = rows[0]-other.rows[0];
        temp.rows[1] = rows[1]-other.rows[1];
        temp.rows[2] = rows[2]-other.rows[2];
        temp.rows[3] = rows[3]-other.rows[3];
 
        return temp;
    }
 
    //! Subtract another matrix.
    inline matrixSIMD4& matrixSIMD4::operator-=(const matrixSIMD4& other)
    {
        rows[0] += other.rows[0];
        rows[1] += other.rows[1];
        rows[2] += other.rows[2];
        rows[3] += other.rows[3];
 
        return *this;
    }
 
    //! Multiply by scalar.
    inline matrixSIMD4 matrixSIMD4::operator*(const float& scalar) const
    {
        matrixSIMD4 temp ( EM4CONST_NOTHING );
 
        temp.rows[0] = rows[0]*scalar;
        temp.rows[1] = rows[1]*scalar;
        temp.rows[2] = rows[2]*scalar;
        temp.rows[3] = rows[3]*scalar;
        return temp;
    }
 
    //! Multiply by scalar.
    inline matrixSIMD4& matrixSIMD4::operator*=(const float& scalar)
    {
        rows[0] *= scalar;
        rows[1] *= scalar;
        rows[2] *= scalar;
        rows[3] *= scalar;
 
        return *this;
    }
 
    //! Multiply by another matrix.
    inline matrixSIMD4& matrixSIMD4::operator*=(const matrixSIMD4& other)
    {
#if defined ( USE_MATRIX_TEST )
        // do checks on your own in order to avoid copy creation
        if ( !other.isIdentity() )
        {
            if ( this->isIdentity() )
            {
                return (*this = other);
            }
            else
            {
                matrixSIMD4 temp ( *this );
                return setbyproduct_nocheck( temp, other );
            }
        }
        return *this;
#else
        matrixSIMD4 temp ( *this );
        return setbyproduct_nocheck( temp, other );
#endif
    }
 
    //! multiply by another matrix
    // set this matrix to the product of two other matrices
    // goal is to reduce stack use and copy
    inline matrixSIMD4& matrixSIMD4::setbyproduct_nocheck(const matrixSIMD4& other_a,const matrixSIMD4& other_b ) //A*B
    {
        // xmm4-7 will now become columuns of B
        __m128 xmm4 = other_b.rows[0].getAsRegister();
        __m128 xmm5 = other_b.rows[1].getAsRegister();
        __m128 xmm6 = other_b.rows[2].getAsRegister();
        __m128 xmm7 = other_b.rows[3].getAsRegister();
        _MM_TRANSPOSE4_PS(xmm4,xmm5,xmm6,xmm7)
 
 
        __m128 xmm0 = other_a.rows[0].getAsRegister();
        __m128 xmm1 = _mm_hadd_ps(_mm_mul_ps(xmm0,xmm4),_mm_mul_ps(xmm0,xmm5)); //(x_l,x_u,y_l,y_u)
        __m128 xmm2 = _mm_hadd_ps(_mm_mul_ps(xmm0,xmm6),_mm_mul_ps(xmm0,xmm7)); //(z_l,z_u,w_l,w_u)
        rows[0] = _mm_hadd_ps(xmm1,xmm2); //(x,y,z,w)
 
        xmm0 = other_a.rows[1].getAsRegister();
        xmm1 = _mm_hadd_ps(_mm_mul_ps(xmm0,xmm4),_mm_mul_ps(xmm0,xmm5)); //(x_l,x_u,y_l,y_u)
        xmm2 = _mm_hadd_ps(_mm_mul_ps(xmm0,xmm6),_mm_mul_ps(xmm0,xmm7)); //(z_l,z_u,w_l,w_u)
        rows[1] = _mm_hadd_ps(xmm1,xmm2); //(x,y,z,w)
 
        xmm0 = other_a.rows[2].getAsRegister();
        xmm1 = _mm_hadd_ps(_mm_mul_ps(xmm0,xmm4),_mm_mul_ps(xmm0,xmm5)); //(x_l,x_u,y_l,y_u)
        xmm2 = _mm_hadd_ps(_mm_mul_ps(xmm0,xmm6),_mm_mul_ps(xmm0,xmm7)); //(z_l,z_u,w_l,w_u)
        rows[2] = _mm_hadd_ps(xmm1,xmm2); //(x,y,z,w)
 
        xmm0 = other_a.rows[3].getAsRegister();
        xmm1 = _mm_hadd_ps(_mm_mul_ps(xmm0,xmm4),_mm_mul_ps(xmm0,xmm5)); //(x_l,x_u,y_l,y_u)
        xmm2 = _mm_hadd_ps(_mm_mul_ps(xmm0,xmm6),_mm_mul_ps(xmm0,xmm7)); //(z_l,z_u,w_l,w_u)
        rows[3] = _mm_hadd_ps(xmm1,xmm2); //(x,y,z,w)
 
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
        return *this;
    }
 
 
    //! multiply by another matrix
    // set this matrix to the product of two other matrices
    // goal is to reduce stack use and copy
    inline matrixSIMD4& matrixSIMD4::setbyproduct(const matrixSIMD4& other_a, const matrixSIMD4& other_b )
    {
#if defined ( USE_MATRIX_TEST )
        if ( other_a.isIdentity () )
            return (*this = other_b);
        else
        if ( other_b.isIdentity () )
            return (*this = other_a);
        else
            return setbyproduct_nocheck(other_a,other_b);
#else
        return setbyproduct_nocheck(other_a,other_b);
#endif
    }
 
    //! multiply by another matrix
    inline matrixSIMD4 matrixSIMD4::operator*(const matrixSIMD4& m2) const
    {
#if defined ( USE_MATRIX_TEST )
        // Testing purpose..
        if ( this->isIdentity() )
            return m2;
        if ( m2.isIdentity() )
            return *this;
 
        definitelyIdentityMatrix=false;
#endif
 
 
        matrixSIMD4 m3 ( EM4CONST_NOTHING );
        return m3.setbyproduct_nocheck(*this,m2);
    }
 
 
    inline vectorSIMDf matrixSIMD4::getTranslation() const
    {
        __m128 xmm0 = _mm_unpackhi_ps(rows[0].getAsRegister(),rows[1].getAsRegister()); // (0z,1z,0w,1w)
        __m128 xmm1 = _mm_unpackhi_ps(rows[2].getAsRegister(),rows[3].getAsRegister()); // (2z,3z,2w,3w)
        __m128 xmm2 = _mm_movehl_ps(xmm1,xmm0);// (0w,1w,2w,3w)
 
        return xmm2;
    }
    inline vectorSIMDf matrixSIMD4::getTranslation3D() const
    {
        __m128 xmm0 = _mm_unpackhi_ps(rows[0].getAsRegister(),rows[1].getAsRegister()); // (0z,1z,0w,1w)
        __m128 xmm1 = _mm_unpackhi_ps(rows[2].getAsRegister(),_mm_setzero_ps()); // (2z,0,2w,0)
        __m128 xmm2 = _mm_movehl_ps(xmm1,xmm0);// (0w,1w,2w,0)
 
        return xmm2;
    }
 
 
    //! Writes the translation into the W element of each row.
    /** X, Y, Z and W of translation go to rows 0, 1, 2 and 3 respectively; no other
    element is touched. With USE_MATRIX_TEST defined the identity flag is cleared.
    \return *this */
    inline matrixSIMD4& matrixSIMD4::setTranslation( const vectorSIMDf& translation )
    {
        rows[0].W = translation.X;
        rows[1].W = translation.Y;
        rows[2].W = translation.Z;
        rows[3].W = translation.W;
 
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
        return *this;
    }
    //! Writes a 3D translation into the W element of rows 0 to 2.
    /** Row 3 and the W component of translation are not used. With USE_MATRIX_TEST
    defined the identity flag is cleared.
    \return *this */
    inline matrixSIMD4& matrixSIMD4::setTranslation3D( const vectorSIMDf& translation )
    {
        rows[0].W = translation.X;
        rows[1].W = translation.Y;
        rows[2].W = translation.Z;
 
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
        return *this;
    }
 
 
    inline matrixSIMD4& matrixSIMD4::setInverseTranslation( const vectorSIMDf& translation )
    {
        return setTranslation(-translation);
    }
    inline matrixSIMD4& matrixSIMD4::setInverseTranslation3D( const vectorSIMDf& translation )
    {
        return setTranslation3D(-translation);
    }
 
    inline matrixSIMD4& matrixSIMD4::setScale( const vectorSIMDf& scale )
    {
        //_m128i xmm0 = _mm_castps_si128(_mm_mul_ps(scale.getAsRegister(),_mm_rsqrt_ps(getScaleSQ().getAsRegister())));
        _m128i xmm0 = _mm_castps_si128(scale.getAsRegister());
 
        _mm_maskmoveu_si128(xmm0,_mm_set_epi32(0,0,0,-1),(char*)rows);
        _mm_maskmoveu_si128(xmm0,_mm_set_epi32(0,0,-1,0),(char*)(rows+1));
        _mm_maskmoveu_si128(xmm0,_mm_set_epi32(0,-1,0,0),(char*)(rows+2));
        _mm_maskmoveu_si128(xmm0,_mm_set_epi32(-1,0,0,0),(char*)(rows+3));
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
        return *this;
    }
 
    inline matrixSIMD4& matrixSIMD4::setScale3D const vectorSIMDf& scale )
    {
        //_m128i xmm0 = _mm_castps_si128(_mm_mul_ps(scale.getAsRegister(),_mm_rsqrt_ps(getScaleSQ().getAsRegister())));
        _m128i xmm0 = _mm_castps_si128(scale.getAsRegister());
 
        _mm_maskmoveu_si128(xmm0,_mm_set_epi32(0,0,0,-1),(char*)rows);
        _mm_maskmoveu_si128(xmm0,_mm_set_epi32(0,0,-1,0),(char*)(rows+1));
        _mm_maskmoveu_si128(xmm0,_mm_set_epi32(0,-1,0,0),(char*)(rows+2));
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
        return *this;
    }
 
 
    //! Returns the absolute values of the scales of the matrix.
    /**
    Note that this returns the absolute (positive) values unless only scale is set.
    Unfortunately it does not appear to be possible to extract any original negative
    values. The best that we could do would be to arbitrarily make one scale
    negative if one or three of them were negative.
    FIXME - return the original values.
    */
    inline vectorSIMDf matrixSIMD4::getScaleSQ() const
    {
#ifdef __IRR_COMPILE_WITH_SSE3
        // xmm4-7 will now become columuns of B
        __m128 xmm4 = rows[0].getAsRegister();
        __m128 xmm5 = rows[1].getAsRegister();
        __m128 xmm6 = rows[2].getAsRegister();
        __m128 xmm7 = _mm_setzero_ps();
        // g==0
        __m128 xmm0 = _mm_unpacklo_ps(xmm4.xmm5);
        __m128 xmm1 = _mm_unpacklo_ps(xmm6,xmm7); // (2x,g,2y,g)
        __m128 xmm2 = _mm_unpackhi_ps(xmm4,xmm5);
        __m128 xmm3 = _mm_unpackhi_ps(xmm6,xmm7); // (2z,g,2w,g)
        xmm4 = _mm_movelh_ps(xmm1,xmm0); //(0x,1x,2x,g)
        xmm5 = _mm_movehl_ps(xmm1,xmm0);
        xmm6 = _mm_movelh_ps(xmm3,xmm2); //(0z,1z,,2z,g)
 
        // See http://www.robertblum.com/articles/2005/02/14/decomposing-matrices
        // We have to do the full calculation.
        xmm0 = _mm_mul_ps(xmm4,xmm4);// column 0 squared
        xmm1 = _mm_mul_ps(xmm5,xmm5);// column 1 squared
        xmm2 = _mm_mul_ps(xmm6,xmm6);// column 2 squared
        xmm4 = _mm_hadd_ps(xmm0,xmm1);
        xmm5 = _mm_hadd_ps(xmm2,xmm7);
        xmm6 = _mm_hadd_ps(xmm4,xmm5);
        return xmm6;
#elif defined(__IRR_COMPILE_WITH_SSE2)
#error "SSE2 version not implemented yet"
#endif
    }
    inline vectorSIMDf matrixSIMD4::getScale() const
    {
#ifdef __IRR_COMPILE_WITH_SSE3
        return getScaleSQ().getSquareRoot();
#elif defined(__IRR_COMPILE_WITH_SSE2)
#error "SSE2 version not implemented yet"
#endif
    }
 
 
    /*
        check identity with epsilon
        solve floating range problems..
    */
    inline bool matrixSIMD4::isIdentity() const
    {
#if defined ( USE_MATRIX_TEST )
        if (definitelyIdentityMatrix)
            return true;
#endif
        vector4db_SIMD tmp = (rows[0]!=vectorSIMDf(1.f,0.f,0.f,0.f))|(rows[1]!=vectorSIMDf(0.f,1.f,0.f,0.f))|(rows[2]!=vectorSIMDf(0.f,0.f,1.f,0.f))|(rows[3]!=vectorSIMDf(0.f,0.f,0.f,1.f));
 
        if (tmp.any())
            return false;
 
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=true;
#endif
        return true;
    }
 
 
    /* Check orthogonality of matrix. */
    inline bool matrixSIMD4::isOrthogonal() const
    {
        //all of the column vectors have to be orthogonal to each other
        return ((*this)*(*this).getTransposed()).isIdentity();
    }
 
 
Last edited by devsh on Fri May 01, 2015 12:41 pm, edited 2 times in total.
devsh
Competition winner
Posts: 2057
Joined: Tue Dec 09, 2008 6:00 pm
Location: UK
Contact:

Re: WANT 4x SPEEDUPS on CPU-side CODE??? SIMD IRRLICHT VECTO

Post by devsh »

Part 2 of matrixSIMD4.h:

Code: Select all

/*
    inline void matrixSIMD4::rotateVect( vector3df& vect ) const
    {
        vector3df tmp = vect;
        vect.X = tmp.X*M[0] + tmp.Y*M[4] + tmp.Z*M[8];
        vect.Y = tmp.X*M[1] + tmp.Y*M[5] + tmp.Z*M[9];
        vect.Z = tmp.X*M[2] + tmp.Y*M[6] + tmp.Z*M[10];
    }
 
    //! An alternate transform vector method, writing into a second vector
    inline void matrixSIMD4::rotateVect(core::vector3df& out, const core::vector3df& in) const
    {
        out.X = in.X*M[0] + in.Y*M[4] + in.Z*M[8];
        out.Y = in.X*M[1] + in.Y*M[5] + in.Z*M[9];
        out.Z = in.X*M[2] + in.Y*M[6] + in.Z*M[10];
    }
 
    //! An alternate transform vector method, writing into an array of 3 floats
    inline void matrixSIMD4::rotateVect(float *out, const core::vector3df& in) const
    {
        out[0] = in.X*M[0] + in.Y*M[4] + in.Z*M[8];
        out[1] = in.X*M[1] + in.Y*M[5] + in.Z*M[9];
        out[2] = in.X*M[2] + in.Y*M[6] + in.Z*M[10];
    }
 
    inline void matrixSIMD4::inverseRotateVect( vector3df& vect ) const
    {
        vector3df tmp = vect;
        vect.X = tmp.X*M[0] + tmp.Y*M[1] + tmp.Z*M[2];
        vect.Y = tmp.X*M[4] + tmp.Y*M[5] + tmp.Z*M[6];
        vect.Z = tmp.X*M[8] + tmp.Y*M[9] + tmp.Z*M[10];
    }
*/
 
    inline void matrixSIMD4::transformVect( vectorSIMDf& vect) const
    {
        transformVect(vect,vect);
    }
 
    inline void matrixSIMD4::transformVect( vectorSIMDf& out, const vectorSIMDf& in) const
    {
        transformVect(out.pointer,in);
    }
 
 
    inline void matrixSIMD4::transformVect(float *out, const vectorSIMDf &in) const
    {
        __m128 xmm4 = in.getAsRegister();
        __m128 xmm0 = _mm_mul_ps(rows[0].getAsRegister(),xmm4);
        __m128 xmm1 = _mm_mul_ps(rows[1].getAsRegister(),xmm4);
        __m128 xmm2 = _mm_mul_ps(rows[2].getAsRegister(),xmm4);
        __m128 xmm3 = _mm_mul_ps(rows[3].getAsRegister(),xmm4);
        xmm4 = _mm_hadd_ps(xmm2,xmm3);
        xmm2 = _mm_hadd_ps(xmm0,xmm1);
        _mm_store_ps(out,_mm_hadd_ps(xmm2,xmm4));
    }
 
/*
    //! Transforms a plane by this matrix
    inline void matrixSIMD4::transformPlane( core::plane3d<f32> &plane) const
    {
        core::plane3df temp;
        transformPlane(plane,temp);
        plane = temp;
    }
 
    //! Transforms a plane by this matrix
    inline void matrixSIMD4::transformPlane( const core::plane3d<f32> &in, core::plane3d<f32> &out) const
    {
        matrixSIMD4 transposedInverse(*this, EM4CONST_INVERSE);
        out.Normal.X = in.Normal.X*transposedInverse[0] + in.Normal.Y*transposedInverse[1] + in.Normal.Z*transposedInverse[2] + in.D*transposedInverse[3];
        out.Normal.Y = in.Normal.X*transposedInverse[4] + in.Normal.Y*transposedInverse[5] + in.Normal.Z*transposedInverse[6] + in.D*transposedInverse[7];
        out.Normal.Z = in.Normal.X*transposedInverse[8] + in.Normal.Y*transposedInverse[9] + in.Normal.Z*transposedInverse[10] + in.D*transposedInverse[11];
        out.D = in.Normal.X*transposedInverse[12] + in.Normal.Y*transposedInverse[13] + in.Normal.Z*transposedInverse[14] + in.D*transposedInverse[15];
    }
 
    //! Transforms a axis aligned bounding box
    inline void matrixSIMD4::transformBox(core::aabbox3d<f32>& box) const
    {
#if defined ( USE_MATRIX_TEST )
        if (isIdentity())
            return;
#endif
 
        transformVect(box.MinEdge);
        transformVect(box.MaxEdge);
        box.repair();
    }
 
    //! Transforms a axis aligned bounding box more accurately than transformBox()
    inline void matrixSIMD4::transformBoxEx(core::aabbox3d<f32>& box) const
    {
#if defined ( USE_MATRIX_TEST )
        if (isIdentity())
            return;
#endif
 
        const f32 Amin[3] = {box.MinEdge.X, box.MinEdge.Y, box.MinEdge.Z};
        const f32 Amax[3] = {box.MaxEdge.X, box.MaxEdge.Y, box.MaxEdge.Z};
 
        f32 Bmin[3];
        f32 Bmax[3];
 
        Bmin[0] = Bmax[0] = M[12];
        Bmin[1] = Bmax[1] = M[13];
        Bmin[2] = Bmax[2] = M[14];
 
        const matrixSIMD4 &m = *this;
 
        for (u32 i = 0; i < 3; ++i)
        {
            for (u32 j = 0; j < 3; ++j)
            {
                const f32 a = m(j,i) * Amin[j];
                const f32 b = m(j,i) * Amax[j];
 
                if (a < b)
                {
                    Bmin[i] += a;
                    Bmax[i] += b;
                }
                else
                {
                    Bmin[i] += b;
                    Bmax[i] += a;
                }
            }
        }
 
        box.MinEdge.X = Bmin[0];
        box.MinEdge.Y = Bmin[1];
        box.MinEdge.Z = Bmin[2];
 
        box.MaxEdge.X = Bmax[0];
        box.MaxEdge.Y = Bmax[1];
        box.MaxEdge.Z = Bmax[2];
    }
 
*/
    inline void matrixSIMD4::inverseTranslateVect( vectorSIMDf& vect ) const
    {
        __m128 xmm0 = _mm_unpackhi_ps(rows[0].getAsRegister(),rows[1].getAsRegister()); // (0z,1z,0w,1w)
        __m128 xmm1 = _mm_unpackhi_ps(rows[2].getAsRegister(),_mm_setzero_ps()); // (2z,3z,2w,3w)
        __m128 xmm2 = _mm_movehl_ps(xmm1,xmm0);// (0w,1w,2w,3w)
 
        vect -= xmm2;
    }
 
    inline void matrixSIMD4::translateVect( vector3df& vect ) const
    {
        __m128 xmm0 = _mm_unpackhi_ps(rows[0].getAsRegister(),rows[1].getAsRegister()); // (0z,1z,0w,1w)
        __m128 xmm1 = _mm_unpackhi_ps(rows[2].getAsRegister(),_mm_setzero_ps()); // (2z,3z,2w,3w)
        __m128 xmm2 = _mm_movehl_ps(xmm1,xmm0);// (0w,1w,2w,3w)
 
        vect += xmm2;
    }
 
 
    inline bool matrixSIMD4::getInverse(matrixSIMD4& out) const
    {
        /// Calculates the inverse of this Matrix
        /// The inverse is calculated using Cramers rule.
        /// If no inverse exists then 'false' is returned.
 
#if defined ( USE_MATRIX_TEST )
        if ( this->isIdentity() )
        {
            out=*this;
            return true;
        }
#endif
        vector4db_SIMD isReasonable = (rows[3]==vectorSIMDf(0.f,0.f,0.f,1.f));
        vectorSIMDf determinant4;
 
        if (isReasonable.all())
        {
            // last row is 0,0,0,1 like in a sane 4x4 matrix used in games
            vectorSIMDf tmpA = rows[1].zxxw()*rows[2].yzyw();// (m(1, 2) * m(2, 1)
            vectorSIMDf tmpB = rows[1].yzyw()*rows[2].zxxw();// (m(1, 1) * m(2, 2))
            __m128 tmpC = tmpA-tmpB; //1st column of out matrix
            __m128 preDeterminant = rows[0]*tmpC;
            preDeterminant = _mm_hadd_ps(preDeterminant,preDeterminant); // (x+y,z+w,..)
            determinant4 = _mm_hadd_ps(preDeterminant,preDeterminant); //
 
            if (((uint32_t*)determinant4.pointer)[0]==0.f)
                return false;
 
 
            tmpA = rows[0].zxyw()*rows[2].yzxw();
            tmpB = rows[0].yzxw()*rows[2].zxyw();
            __m128 tmpD = tmpA-tmpB; // 2nd column of out matrix
 
            tmpA = rows[0].yzxw()*rows[1].zxyw();
            tmpB = rows[0].zxyw()*rows[1].yzxw();
            __m128 tmpE = tmpA-tmpB; // 3rd column of out matrix
 
            __m128 xmm0 = tmpC;
            __m128 xmm1 = tmpD;
            __m128 xmm2 = tmpE;
            __m128 xmm3 = _mm_setzero_ps();
 
            _MM_TRANSPOSE4_PS(xmm0,xmm1,xmm2,xmm3)
 
            __m128 xmm4 = getTranslation3D().getAsRegister();
 
 
            xmm0 = _mm_mul_ps(xmm0,xmm4); //out(0,3)
            xmm1 = _mm_mul_ps(xmm1,xmm4); //out(1,3)
            xmm2 = _mm_or_ps(_mm_mul_ps(xmm2,xmm4),_mm_castsi128_ps(_mm_set_epi32(0,-1,0,-1))); //out(2,3)
 
            xmm0 = _mm_hsub_ps(xmm0,xmm1); // C.x-D.x,E.x,C.y-D.y,E.y
            xmm1 = _mm_hsub_ps(xmm2,preDeterminant); // C.z-D.z,E.z,x+y-z-w,x+Y-z-w
            xmm2 = _mm_hsub_ps(xmm0,xmm1); // C.x-D.x-E.x,C.y-D.y-E.y,C.z-D.z-E.z,0
/*
            out(0, 3) = m(0, 3) * tmpC.x +
                        m(1, 3) * -tmpD.x +
                        m(2, 3) * -tmpE.x;
 
            out(1, 3) = m(0, 3) * tmpC.y +
                        m(1, 3) * -tmpD.y +
                        m(2, 3) * -tmpE.y;
 
            out(2, 3) = m(0, 3) * -tmpC.z +
                        m(1, 3) * -tmpD.z;
                        m(2, 3) * tmpE.z;
*/
 
            _MM_TRANSPOSE4_PS(tmpC,tmpD,tmpE,xmm2)
            out.rows[0] = tmpC;
            out.rows[1] = tmpD;
            out.rows[2] = tmpE;
            out.rows[3] = xmm2;
 
            tmpA = xmm1;
            out[15] = -tmpA.w;
        }
        else
        {
            /**
            out(0, 0) = m(1, 1) * (m(2, 2) * m(3, 3) - m(2, 3) * m(3, 2)) + m(1, 2) * (m(2, 3) * m(3, 1) - m(2, 1) * m(3, 3)) + m(1, 3) * (m(2, 1) * m(3, 2) - m(2, 2) * m(3, 1)));
            out(1, 0) = m(1, 2) * (m(2, 0) * m(3, 3) - m(2, 3) * m(3, 0)) + m(1, 3) * (m(2, 2) * m(3, 0) - m(2, 0) * m(3, 2)) + m(1, 0) * (m(2, 3) * m(3, 2) - m(2, 2) * m(3, 3)));
            out(2, 0) = m(1, 3) * (m(2, 0) * m(3, 1) - m(2, 1) * m(3, 0)) + m(1, 0) * (m(2, 1) * m(3, 3) - m(2, 3) * m(3, 1)) + m(1, 1) * (m(2, 3) * m(3, 0) - m(2, 0) * m(3, 3)));
            out(3, 0) = m(1, 0) * (m(2, 2) * m(3, 1) - m(2, 1) * m(3, 2)) + m(1, 1) * (m(2, 0) * m(3, 2) - m(2, 2) * m(3, 0)) + m(1, 2) * (m(2, 1) * m(3, 0) - m(2, 0) * m(3, 1)));
 
            out(0, 1) = (m(2, 1) * (m(0, 2) * m(3, 3) - m(0, 3) * m(3, 2)) + m(2, 2) * (m(0, 3) * m(3, 1) - m(0, 1) * m(3, 3)) + m(2, 3) * (m(0, 1) * m(3, 2) - m(0, 2) * m(3, 1)));
            out(1, 1) = (m(2, 2) * (m(0, 0) * m(3, 3) - m(0, 3) * m(3, 0)) + m(2, 3) * (m(0, 2) * m(3, 0) - m(0, 0) * m(3, 2)) + m(2, 0) * (m(0, 3) * m(3, 2) - m(0, 2) * m(3, 3)));
            out(2, 1) = (m(2, 3) * (m(0, 0) * m(3, 1) - m(0, 1) * m(3, 0)) + m(2, 0) * (m(0, 1) * m(3, 3) - m(0, 3) * m(3, 1)) + m(2, 1) * (m(0, 3) * m(3, 0) - m(0, 0) * m(3, 3)));
            out(3, 1) = (m(2, 0) * (m(0, 2) * m(3, 1) - m(0, 1) * m(3, 2)) + m(2, 1) * (m(0, 0) * m(3, 2) - m(0, 2) * m(3, 0)) + m(2, 2) * (m(0, 1) * m(3, 0) - m(0, 0) * m(3, 1)));
 
            out(0, 2) = (m(3, 1) * (m(0, 2) * m(1, 3) - m(0, 3) * m(1, 2)) + m(3, 2) * (m(0, 3) * m(1, 1) - m(0, 1) * m(1, 3)) + m(3, 3) * (m(0, 1) * m(1, 2) - m(0, 2) * m(1, 1)));
            out(1, 2) = (m(3, 2) * (m(0, 0) * m(1, 3) - m(0, 3) * m(1, 0)) + m(3, 3) * (m(0, 2) * m(1, 0) - m(0, 0) * m(1, 2)) + m(3, 0) * (m(0, 3) * m(1, 2) - m(0, 2) * m(1, 3)));
            out(2, 2) = (m(3, 3) * (m(0, 0) * m(1, 1) - m(0, 1) * m(1, 0)) + m(3, 0) * (m(0, 1) * m(1, 3) - m(0, 3) * m(1, 1)) + m(3, 1) * (m(0, 3) * m(1, 0) - m(0, 0) * m(1, 3)));
            out(3, 2) = (m(3, 0) * (m(0, 2) * m(1, 1) - m(0, 1) * m(1, 2)) + m(3, 1) * (m(0, 0) * m(1, 2) - m(0, 2) * m(1, 0)) + m(3, 2) * (m(0, 1) * m(1, 0) - m(0, 0) * m(1, 1)));
 
            out(0, 3) = (m(0, 1) * (m(1, 3) * m(2, 2) - m(1, 2) * m(2, 3)) + m(0, 2) * (m(1, 1) * m(2, 3) - m(1, 3) * m(2, 1)) + m(0, 3) * (m(1, 2) * m(2, 1) - m(1, 1) * m(2, 2)));
            out(1, 3) = (m(0, 2) * (m(1, 3) * m(2, 0) - m(1, 0) * m(2, 3)) + m(0, 3) * (m(1, 0) * m(2, 2) - m(1, 2) * m(2, 0)) + m(0, 0) * (m(1, 2) * m(2, 3) - m(1, 3) * m(2, 2)));
            out(2, 3) = (m(0, 3) * (m(1, 1) * m(2, 0) - m(1, 0) * m(2, 1)) + m(0, 0) * (m(1, 3) * m(2, 1) - m(1, 1) * m(2, 3)) + m(0, 1) * (m(1, 0) * m(2, 3) - m(1, 3) * m(2, 0)));
            out(3, 3) = (m(0, 0) * (m(1, 1) * m(2, 2) - m(1, 2) * m(2, 1)) + m(0, 1) * (m(1, 2) * m(2, 0) - m(1, 0) * m(2, 2)) + m(0, 2) * (m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0)));
            **/
            vectorSIMDf tmpA = rows[2].zxxz();
            vectorSIMDf tmpB = rows[3].wwyy();
            vectorSIMDf tmpC = rows[2].wwyy();
            vectorSIMDf tmpD = rows[3].zxxz();
            vectorSIMDf tmpE = rows[2].wzyx();
            vectorSIMDf tmpF = rows[3].yxwz();
            vectorSIMDf tmpG = rows[2].yxwz();
            vectorSIMDf tmpH = rows[3].wzyx();
            vectorSIMDf tmpI = rows[2].ywwy();
            vectorSIMDf tmpJ = rows[3].zzxx();
            vectorSIMDf tmpK = rows[2].zzxx();
            vectorSIMDf tmpL = rows[3].ywwy();
            __m128 xmm0 = (rows[1].yzwx()*(tmpA*tmpB-tmpC*tmpD)+rows[1].zwxy()*(tmpE*tmpF-tmpG*tmpH)+rows[1].wxyz()*(tmpI*tmpJ-tmpK*tmpL)).getAsRegister();
 
            determinant4 = rows[0].dotProduct(xmm0);
            if (((uint32_t*)determinant4.pointer)[0]==0.f)
                return false;
 
            vectorSIMDf tmpM = rows[0].zxxz();
            vectorSIMDf tmpN = rows[0].wwyy();
            vectorSIMDf tmpO = rows[0].wzyx();
            vectorSIMDf tmpP = rows[0].yxwz();
            vectorSIMDf tmpQ = rows[0].ywwy();
            vectorSIMDf tmpR = rows[0].zzxx();
            __m128 xmm1 = (rows[2].yzwx()*(tmpM*tmpB-tmpN*tmpD)+rows[2].zwxy()*(tmpO*tmpF-tmpP*tmpH)+rows[2].wxyz()*(tmpQ*tmpJ-tmpR*tmpL)).getAsRegister();
            vectorSIMDf tmpS = rows[1].wwyy();
            vectorSIMDf tmpT = rows[1].zxxz();
            vectorSIMDf tmpU = rows[1].yxwz();
            vectorSIMDf tmpV = rows[1].wzyx();
            vectorSIMDf tmpW = rows[1].zzxx();
            vectorSIMDf tmpX = rows[1].ywwy();
            __m128 xmm2 = (rows[3].yzwx()*(tmpM*tmpS-tmpN*tmpT)+rows[3].zwxy()*(tmpO*tmpU-tmpP*tmpV)+rows[3].wxyz()*(tmpQ*tmpW-tmpR*tmpX)).getAsRegister();
            __m128 xmm3 = (rows[0].yzwx()*(tmpS*tmpA-tmpT*tmpC)+rows[0].zwxy()*(tmpU*tmpE-tmpV*tmpG)+rows[0].wxyz()*(tmpW*tmpI-tmpX*tmpK)).getAsRegister();
 
 
            _MM_TRANSPOSE4_PS(xmm0,xmm1,xmm2,xmm3)
            out.rows[0] = xmm0;
            out.rows[1] = xmm1;
            out.rows[2] = xmm2;
            out.rows[3] = xmm3;
        }
 
 
        __m128 xmm0 = _mm_rcp_ps(determinant4.getAsRegister());
        out.rows[0] *= xmm0;
        out.rows[1] *= xmm0;
        out.rows[2] *= xmm0;
        out.rows[3] *= xmm0;
 
#if defined ( USE_MATRIX_TEST )
        out.definitelyIdentityMatrix = false;
#endif
        return true;
    }
 
/*
    //! Inverts a primitive matrix which only contains a translation and a rotation
    //! \param out: where result matrix is written to.
    inline bool matrixSIMD4::getInversePrimitive ( matrixSIMD4& out ) const
    {
        out.M[0 ] = M[0];
        out.M[1 ] = M[4];
        out.M[2 ] = M[8];
        out.M[3 ] = 0;
 
        out.M[4 ] = M[1];
        out.M[5 ] = M[5];
        out.M[6 ] = M[9];
        out.M[7 ] = 0;
 
        out.M[8 ] = M[2];
        out.M[9 ] = M[6];
        out.M[10] = M[10];
        out.M[11] = 0;
 
        out.M[12] = -(M[12]*M[0] + M[13]*M[1] + M[14]*M[2]);
        out.M[13] = -(M[12]*M[4] + M[13]*M[5] + M[14]*M[6]);
        out.M[14] = -(M[12]*M[8] + M[13]*M[9] + M[14]*M[10]);
        out.M[15] = 1;
 
#if defined ( USE_MATRIX_TEST )
        out.definitelyIdentityMatrix = definitelyIdentityMatrix;
#endif
        return true;
    }
 
    //!
    inline bool matrixSIMD4::makeInverse()
    {
#if defined ( USE_MATRIX_TEST )
        if (definitelyIdentityMatrix)
            return true;
#endif
        matrixSIMD4 temp ( EM4CONST_NOTHING );
 
        if (getInverse(temp))
        {
            *this = temp;
            return true;
        }
 
        return false;
    }
*/
 
    inline matrixSIMD4& matrixSIMD4::operator=(const matrixSIMD4 &other)
    {
        _mm_store_ps(rows[0].pointer,other.rows[0].getAsRegister());
        _mm_store_ps(rows[1].pointer,other.rows[1].getAsRegister());
        _mm_store_ps(rows[2].pointer,other.rows[2].getAsRegister());
        _mm_store_ps(rows[3].pointer,other.rows[3].getAsRegister());
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=other.definitelyIdentityMatrix;
#endif
        return *this;
    }
 
 
    inline matrixSIMD4& matrixSIMD4::operator=(const float& scalar)
    {
        __m128 xmm0 = _mm_load_ps1(&scalar);
        _mm_store_ps(rows[0].pointer,xmm0);
        _mm_store_ps(rows[1].pointer,xmm0);
        _mm_store_ps(rows[2].pointer,xmm0);
        _mm_store_ps(rows[3].pointer,xmm0);
 
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
        return *this;
    }
 
 
    inline bool matrixSIMD4::operator==(const matrixSIMD4 &other) const
    {
#if defined ( USE_MATRIX_TEST )
        if (definitelyIdentityMatrix && other.definitelyIdentityMatrix)
            return true;
#endif
 
        return !((*this)!=other);
    }
 
 
    //! Inequality test over all four rows.
    /** Compares each row of this matrix with the matching row of \p other, combines the
    four per-row comparison results with operator| and returns any() of the combined result.
    NOTE(review): presumably this yields true as soon as at least one component differs;
    confirm against vectorSIMDf::operator!= and the any() of its result type.
    \param other: matrix to compare against. */
    inline bool matrixSIMD4::operator!=(const matrixSIMD4 &other) const
    {
        return ((rows[0]!=other.rows[0])|(rows[1]!=other.rows[1])|(rows[2]!=other.rows[2])|(rows[3]!=other.rows[3])).any();
    }
 
/*
    // Builds a right-handed perspective projection matrix based on a field of view
    inline matrixSIMD4& matrixSIMD4::buildProjectionMatrixPerspectiveFovRH(
            f32 fieldOfViewRadians, f32 aspectRatio, f32 zNear, f32 zFar)
    {
        const f32 h = reciprocal(tan(fieldOfViewRadians*0.5));
        _IRR_DEBUG_BREAK_IF(aspectRatio==0.f); //divide by zero
        const float w = h / aspectRatio;
 
        _IRR_DEBUG_BREAK_IF(zNear==zFar); //divide by zero
        M[0] = w;
        M[1] = 0;
        M[2] = 0;
        M[3] = 0;
 
        M[4] = 0;
        M[5] = h;
        M[6] = 0;
        M[7] = 0;
 
        M[8] = 0;
        M[9] = 0;
        M[10] = (zFar/(zNear-zFar)); // DirectX version
//      M[10] = (zFar+zNear/(zNear-zFar)); // OpenGL version
        M[11] = -1;
 
        M[12] = 0;
        M[13] = 0;
        M[14] = (zNear*zFar/(zNear-zFar)); // DirectX version
//      M[14] = (2.0f*zNear*zFar/(zNear-zFar)); // OpenGL version
        M[15] = 0;
 
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
        return *this;
    }
 
 
    // Builds a left-handed perspective projection matrix based on a field of view
    inline matrixSIMD4& matrixSIMD4::buildProjectionMatrixPerspectiveFovLH(
            f32 fieldOfViewRadians, f32 aspectRatio, f32 zNear, f32 zFar)
    {
        const f32 h = reciprocal(tan(fieldOfViewRadians*0.5));
        _IRR_DEBUG_BREAK_IF(aspectRatio==0.f); //divide by zero
        const float w = (h / aspectRatio);
 
        _IRR_DEBUG_BREAK_IF(zNear==zFar); //divide by zero
        M[0] = w;
        M[1] = 0;
        M[2] = 0;
        M[3] = 0;
 
        M[4] = 0;
        M[5] = h;
        M[6] = 0;
        M[7] = 0;
 
        M[8] = 0;
        M[9] = 0;
        M[10] = (zFar/(zFar-zNear));
        M[11] = 1;
 
        M[12] = 0;
        M[13] = 0;
        M[14] = (-zNear*zFar/(zFar-zNear));
        M[15] = 0;
 
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
        return *this;
    }
 
 
    // Builds a left-handed perspective projection matrix based on a field of view, with far plane culling at infinity
    inline matrixSIMD4& matrixSIMD4::buildProjectionMatrixPerspectiveFovInfinityLH(
            f32 fieldOfViewRadians, f32 aspectRatio, f32 zNear, f32 epsilon)
    {
        const f32 h = reciprocal(tan(fieldOfViewRadians*0.5));
        _IRR_DEBUG_BREAK_IF(aspectRatio==0.f); //divide by zero
        const float w = h / aspectRatio;
 
        M[0] = w;
        M[1] = 0;
        M[2] = 0;
        M[3] = 0;
 
        M[4] = 0;
        M[5] = h;
        M[6] = 0;
        M[7] = 0;
 
        M[8] = 0;
        M[9] = 0;
        M[10] = (1.f-epsilon);
        M[11] = 1;
 
        M[12] = 0;
        M[13] = 0;
        M[14] = (zNear*(epsilon-1.f));
        M[15] = 0;
 
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
        return *this;
    }
 
 
    // Builds a left-handed orthogonal projection matrix.
    inline matrixSIMD4& matrixSIMD4::buildProjectionMatrixOrthoLH(
            f32 widthOfViewVolume, f32 heightOfViewVolume, f32 zNear, f32 zFar)
    {
        _IRR_DEBUG_BREAK_IF(widthOfViewVolume==0.f); //divide by zero
        _IRR_DEBUG_BREAK_IF(heightOfViewVolume==0.f); //divide by zero
        _IRR_DEBUG_BREAK_IF(zNear==zFar); //divide by zero
        M[0] = (2/widthOfViewVolume);
        M[1] = 0;
        M[2] = 0;
        M[3] = 0;
 
        M[4] = 0;
        M[5] = (2/heightOfViewVolume);
        M[6] = 0;
        M[7] = 0;
 
        M[8] = 0;
        M[9] = 0;
        M[10] = (1/(zFar-zNear));
        M[11] = 0;
 
        M[12] = 0;
        M[13] = 0;
        M[14] = (zNear/(zNear-zFar));
        M[15] = 1;
 
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
        return *this;
    }
 
 
    // Builds a right-handed orthogonal projection matrix.
    inline matrixSIMD4& matrixSIMD4::buildProjectionMatrixOrthoRH(
            f32 widthOfViewVolume, f32 heightOfViewVolume, f32 zNear, f32 zFar)
    {
        _IRR_DEBUG_BREAK_IF(widthOfViewVolume==0.f); //divide by zero
        _IRR_DEBUG_BREAK_IF(heightOfViewVolume==0.f); //divide by zero
        _IRR_DEBUG_BREAK_IF(zNear==zFar); //divide by zero
        M[0] = (2/widthOfViewVolume);
        M[1] = 0;
        M[2] = 0;
        M[3] = 0;
 
        M[4] = 0;
        M[5] = (2/heightOfViewVolume);
        M[6] = 0;
        M[7] = 0;
 
        M[8] = 0;
        M[9] = 0;
        M[10] = (1/(zNear-zFar));
        M[11] = 0;
 
        M[12] = 0;
        M[13] = 0;
        M[14] = (zNear/(zNear-zFar));
        M[15] = 1;
 
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
        return *this;
    }
 
 
    // Builds a right-handed perspective projection matrix.
    inline matrixSIMD4& matrixSIMD4::buildProjectionMatrixPerspectiveRH(
            f32 widthOfViewVolume, f32 heightOfViewVolume, f32 zNear, f32 zFar)
    {
        _IRR_DEBUG_BREAK_IF(widthOfViewVolume==0.f); //divide by zero
        _IRR_DEBUG_BREAK_IF(heightOfViewVolume==0.f); //divide by zero
        _IRR_DEBUG_BREAK_IF(zNear==zFar); //divide by zero
        M[0] = (2*zNear/widthOfViewVolume);
        M[1] = 0;
        M[2] = 0;
        M[3] = 0;
 
        M[4] = 0;
        M[5] = (2*zNear/heightOfViewVolume);
        M[6] = 0;
        M[7] = 0;
 
        M[8] = 0;
        M[9] = 0;
        M[10] = (zFar/(zNear-zFar));
        M[11] = -1;
 
        M[12] = 0;
        M[13] = 0;
        M[14] = (zNear*zFar/(zNear-zFar));
        M[15] = 0;
 
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
        return *this;
    }
 
 
    // Builds a left-handed perspective projection matrix.
    inline matrixSIMD4& matrixSIMD4::buildProjectionMatrixPerspectiveLH(
            f32 widthOfViewVolume, f32 heightOfViewVolume, f32 zNear, f32 zFar)
    {
        _IRR_DEBUG_BREAK_IF(widthOfViewVolume==0.f); //divide by zero
        _IRR_DEBUG_BREAK_IF(heightOfViewVolume==0.f); //divide by zero
        _IRR_DEBUG_BREAK_IF(zNear==zFar); //divide by zero
        M[0] = (2*zNear/widthOfViewVolume);
        M[1] = 0;
        M[2] = 0;
        M[3] = 0;
 
        M[4] = 0;
        M[5] = (2*zNear/heightOfViewVolume);
        M[6] = 0;
        M[7] = 0;
 
        M[8] = 0;
        M[9] = 0;
        M[10] = (zFar/(zFar-zNear));
        M[11] = 1;
 
        M[12] = 0;
        M[13] = 0;
        M[14] = (zNear*zFar/(zNear-zFar));
        M[15] = 0;
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
        return *this;
    }
 
 
    // Builds a matrix that flattens geometry into a plane.
    inline matrixSIMD4& matrixSIMD4::buildShadowMatrix(const core::vector3df& light, core::plane3df plane, f32 point)
    {
        plane.Normal.normalize();
        const f32 d = plane.Normal.dotProduct(light);
 
        M[ 0] = (-plane.Normal.X * light.X + d);
        M[ 1] = (-plane.Normal.X * light.Y);
        M[ 2] = (-plane.Normal.X * light.Z);
        M[ 3] = (-plane.Normal.X * point);
 
        M[ 4] = (-plane.Normal.Y * light.X);
        M[ 5] = (-plane.Normal.Y * light.Y + d);
        M[ 6] = (-plane.Normal.Y * light.Z);
        M[ 7] = (-plane.Normal.Y * point);
 
        M[ 8] = (-plane.Normal.Z * light.X);
        M[ 9] = (-plane.Normal.Z * light.Y);
        M[10] = (-plane.Normal.Z * light.Z + d);
        M[11] = (-plane.Normal.Z * point);
 
        M[12] = (-plane.D * light.X);
        M[13] = (-plane.D * light.Y);
        M[14] = (-plane.D * light.Z);
        M[15] = (-plane.D * point + d);
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
        return *this;
    }
 
    // Builds a left-handed look-at matrix.
    inline matrixSIMD4& matrixSIMD4::buildCameraLookAtMatrixLH(
                const vector3df& position,
                const vector3df& target,
                const vector3df& upVector)
    {
        vector3df zaxis = target - position;
        zaxis.normalize();
 
        vector3df xaxis = upVector.crossProduct(zaxis);
        xaxis.normalize();
 
        vector3df yaxis = zaxis.crossProduct(xaxis);
 
        M[0] = xaxis.X;
        M[1] = yaxis.X;
        M[2] = zaxis.X;
        M[3] = 0;
 
        M[4] = xaxis.Y;
        M[5] = yaxis.Y;
        M[6] = zaxis.Y;
        M[7] = 0;
 
        M[8] = xaxis.Z;
        M[9] = yaxis.Z;
        M[10] =zaxis.Z;
        M[11] = 0;
 
        M[12] = -xaxis.dotProduct(position);
        M[13] = -yaxis.dotProduct(position);
        M[14] = -zaxis.dotProduct(position);
        M[15] = 1;
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
        return *this;
    }
 
 
    // Builds a right-handed look-at matrix.
    inline matrixSIMD4& matrixSIMD4::buildCameraLookAtMatrixRH(
                const vector3df& position,
                const vector3df& target,
                const vector3df& upVector)
    {
        vector3df zaxis = position - target;
        zaxis.normalize();
 
        vector3df xaxis = upVector.crossProduct(zaxis);
        xaxis.normalize();
 
        vector3df yaxis = zaxis.crossProduct(xaxis);
 
        M[0] = xaxis.X;
        M[1] = yaxis.X;
        M[2] = zaxis.X;
        M[3] = 0;
 
        M[4] = xaxis.Y;
        M[5] = yaxis.Y;
        M[6] = zaxis.Y;
        M[7] = 0;
 
        M[8] = xaxis.Z;
        M[9] = yaxis.Z;
        M[10] = zaxis.Z;
        M[11] = 0;
 
        M[12] = -xaxis.dotProduct(position);
        M[13] = -yaxis.dotProduct(position);
        M[14] = -zaxis.dotProduct(position);
        M[15] = 1;
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
        return *this;
    }
*/
 
    // creates a new matrix as interpolated matrix from this and the passed one.
    inline matrixSIMD4 matrixSIMD4::interpolate(const matrixSIMD4& b, const float &factor) const
    {
        matrixSIMD4 mat ( EM4CONST_NOTHING );
 
        mat.rows[0] = vectorSIMDf::mix(this->rows[0],b.rows[0],factor);
        mat.rows[1] = vectorSIMDf::mix(this->rows[1],b.rows[1],factor);
        mat.rows[2] = vectorSIMDf::mix(this->rows[2],b.rows[2],factor);
        mat.rows[3] = vectorSIMDf::mix(this->rows[3],b.rows[3],factor);
        return mat;
    }
 
 
    // returns transposed matrix
    inline matrixSIMD4 matrixSIMD4::getTransposed() const
    {
        matrixSIMD4 t ( EM4CONST_NOTHING );
        getTransposed ( t );
        return t;
    }
 
 
    // returns transposed matrix
    inline void matrixSIMD4::getTransposed( matrixSIMD4& o ) const
    {
        __m128 xmm0 = rows[0].getAsRegister();
        __m128 xmm1 = rows[1].getAsRegister();
        __m128 xmm2 = rows[2].getAsRegister();
        __m128 xmm3 = rows[3].getAsRegister();
        _MM_TRANSPOSE4_PS(xmm0,xmm1,xmm2,xmm3)
        _mm_store_ps((float*)o.rows,xmm0);
        _mm_store_ps((float*)(o.rows+1),xmm1);
        _mm_store_ps((float*)(o.rows+2),xmm2);
        _mm_store_ps((float*)(o.rows+3),xmm3);
#if defined ( USE_MATRIX_TEST )
        o.definitelyIdentityMatrix=definitelyIdentityMatrix;
#endif
    }
 
/*
    // used to scale <-1,-1><1,1> to viewport
    inline matrixSIMD4& matrixSIMD4::buildNDCToDCMatrix( const core::rect<s32>& viewport, f32 zScale)
    {
        const f32 scaleX = (viewport.getWidth() - 0.75f ) * 0.5f;
        const f32 scaleY = -(viewport.getHeight() - 0.75f ) * 0.5f;
 
        const f32 dx = -0.5f + ( (viewport.UpperLeftCorner.X + viewport.LowerRightCorner.X ) * 0.5f );
        const f32 dy = -0.5f + ( (viewport.UpperLeftCorner.Y + viewport.LowerRightCorner.Y ) * 0.5f );
 
        makeIdentity();
        M[12] = dx;
        M[13] = dy;
        return setScale(core::vectorSIMDf(scaleX, scaleY, zScale));
    }
 
    //! Builds a matrix that rotates from one vector to another
    /** \param from: vector to rotate from
    \param to: vector to rotate to
 
        http://www.euclideanspace.com/maths/geometry/rotations/conversions/angleToMatrix/index.htm
     *
    inline matrixSIMD4& matrixSIMD4::buildRotateFromTo(const core::vector3df& from, const core::vector3df& to)
    {
        // unit vectors
        core::vector3df f(from);
        core::vector3df t(to);
        f.normalize();
        t.normalize();
 
        // axis multiplication by sin
        core::vector3df vs(t.crossProduct(f));
 
        // axis of rotation
        core::vector3df v(vs);
        v.normalize();
 
        // cosine of the angle between the two unit vectors
        float ca = f.dotProduct(t);
 
        core::vector3df vt(v * (1 - ca));
 
        M[0] = vt.X * v.X + ca;
        M[5] = vt.Y * v.Y + ca;
        M[10] = vt.Z * v.Z + ca;
 
        vt.X *= v.Y;
        vt.Z *= v.X;
        vt.Y *= v.Z;
 
        M[1] = vt.X - vs.Z;
        M[2] = vt.Z + vs.Y;
        M[3] = 0;
 
        M[4] = vt.X + vs.Z;
        M[6] = vt.Y - vs.X;
        M[7] = 0;
 
        M[8] = vt.Z - vs.Y;
        M[9] = vt.Y + vs.X;
        M[11] = 0;
 
        M[12] = 0;
        M[13] = 0;
        M[14] = 0;
        M[15] = 1;
 
        return *this;
    }
 
    //! Builds a matrix which rotates a source vector to a look vector over an arbitrary axis
    /** \param camPos: viewer position in world coord
    \param center: object position in world-coord, rotation pivot
    \param translation: object final translation from center
    \param axis: axis to rotate about
    \param from: source vector to rotate from
     *
    inline void matrixSIMD4::buildAxisAlignedBillboard(
                const core::vector3df& camPos,
                const core::vector3df& center,
                const core::vector3df& translation,
                const core::vector3df& axis,
                const core::vector3df& from)
    {
        // axis of rotation
        core::vector3df up = axis;
        up.normalize();
        const core::vector3df forward = (camPos - center).normalize();
        const core::vector3df right = up.crossProduct(forward).normalize();
 
        // correct look vector
        const core::vector3df look = right.crossProduct(up);
 
        // rotate from to
        // axis multiplication by sin
        const core::vector3df vs = look.crossProduct(from);
 
        // cosine of the angle (assumes 'from' is unit length)
        const f32 ca = from.dotProduct(look);
 
        core::vector3df vt(up * (1.f - ca));
 
        M[0] = (vt.X * up.X + ca);
        M[5] = (vt.Y * up.Y + ca);
        M[10] =(vt.Z * up.Z + ca);
 
        vt.X *= up.Y;
        vt.Z *= up.X;
        vt.Y *= up.Z;
 
        M[1] = (vt.X - vs.Z);
        M[2] = (vt.Z + vs.Y);
        M[3] = 0;
 
        M[4] = (vt.X + vs.Z);
        M[6] = (vt.Y - vs.X);
        M[7] = 0;
 
        M[8] = (vt.Z - vs.Y);
        M[9] = (vt.Y + vs.X);
        M[11] = 0;
 
        setRotationCenter(center, translation);
    }
 
 
    //! Builds a combined matrix which translate to a center before rotation and translate afterwards
    inline void matrixSIMD4::setRotationCenter(const core::vector3df& center, const core::vector3df& translation)
    {
        M[12] = -M[0]*center.X - M[4]*center.Y - M[8]*center.Z + (center.X - translation.X );
        M[13] = -M[1]*center.X - M[5]*center.Y - M[9]*center.Z + (center.Y - translation.Y );
        M[14] = -M[2]*center.X - M[6]*center.Y - M[10]*center.Z + (center.Z - translation.Z );
        M[15] = 1.0;
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
    }
 
 
 
    inline matrixSIMD4& matrixSIMD4::buildTextureTransform( f32 rotateRad,
            const core::vector2df &rotatecenter,
            const core::vector2df &translate,
            const core::vector2df &scale)
    {
        const f32 c = cosf(rotateRad);
        const f32 s = sinf(rotateRad);
 
        M[0] = (c * scale.X);
        M[1] = (s * scale.Y);
        M[2] = 0;
        M[3] = 0;
 
        M[4] = (-s * scale.X);
        M[5] = (c * scale.Y);
        M[6] = 0;
        M[7] = 0;
 
        M[8] = (c * scale.X * rotatecenter.X + -s * rotatecenter.Y + translate.X);
        M[9] = (s * scale.Y * rotatecenter.X +  c * rotatecenter.Y + translate.Y);
        M[10] = 1;
        M[11] = 0;
 
        M[12] = 0;
        M[13] = 0;
        M[14] = 0;
        M[15] = 1;
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
        return *this;
    }
 
 
    // rotate about z axis, center ( 0.5, 0.5 )
    inline matrixSIMD4& matrixSIMD4::setTextureRotationCenter( f32 rotateRad )
    {
        const f32 c = cosf(rotateRad);
        const f32 s = sinf(rotateRad);
        M[0] = c;
        M[1] = s;
 
        M[4] = -s;
        M[5] = c;
 
        M[8] = (0.5f * ( s - c) + 0.5f);
        M[9] = (-0.5f * ( s + c) + 0.5f);
 
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix = definitelyIdentityMatrix && (rotateRad==0.0f);
#endif
        return *this;
    }
 
 
    inline matrixSIMD4& matrixSIMD4::setTextureTranslate ( f32 x, f32 y )
    {
        M[8] = x;
        M[9] = y;
 
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix = definitelyIdentityMatrix && (x==0.0f) && (y==0.0f);
#endif
        return *this;
    }
 
 
    inline matrixSIMD4& matrixSIMD4::setTextureTranslateTransposed ( f32 x, f32 y )
    {
        M[2] = x;
        M[6] = y;
 
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix = definitelyIdentityMatrix && (x==0.0f) && (y==0.0f) ;
#endif
        return *this;
    }
 
    inline matrixSIMD4& matrixSIMD4::setTextureScale ( f32 sx, f32 sy )
    {
        M[0] = sx;
        M[5] = sy;
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix = definitelyIdentityMatrix && (sx==1.0f) && (sy==1.0f);
#endif
        return *this;
    }
 
 
    inline matrixSIMD4& matrixSIMD4::setTextureScaleCenter( f32 sx, f32 sy )
    {
        M[0] = sx;
        M[5] = sy;
        M[8] = (0.5f - 0.5f * sx);
        M[9] = (0.5f - 0.5f * sy);
 
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix = definitelyIdentityMatrix && (sx==1.0f) && (sy==1.0f);
#endif
        return *this;
    }
 
 
    // sets all matrix data members at once
    inline matrixSIMD4& matrixSIMD4::setM(const float* data)
    {
        memcpy(M,data, 16*sizeof(float));
 
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix=false;
#endif
        return *this;
    }
 
 
    // sets if the matrix is definitely identity matrix
    inline void matrixSIMD4::setDefinitelyIdentityMatrix( bool isDefinitelyIdentityMatrix)
    {
#if defined ( USE_MATRIX_TEST )
        definitelyIdentityMatrix = isDefinitelyIdentityMatrix;
#endif
    }
 
 
    // gets if the matrix is definitely identity matrix
    inline bool matrixSIMD4::getDefinitelyIdentityMatrix() const
    {
#if defined ( USE_MATRIX_TEST )
        return definitelyIdentityMatrix;
#else
        return false;
#endif
    }
 
 
    //! Compare two matrices using the equal method
    inline bool matrixSIMD4::equals(const core::matrixSIMD4& other, const float tolerance) const
    {
#if defined ( USE_MATRIX_TEST )
        if (definitelyIdentityMatrix && other.definitelyIdentityMatrix)
            return true;
#endif
        for (s32 i = 0; i < 16; ++i)
            if (!core::equals(M[i],other.M[i], tolerance))
                return false;
 
        return true;
    }
 
 
    // Multiply by scalar.
    inline matrixSIMD4 operator*(const float scalar, const matrixSIMD4& mat)
    {
        return mat*scalar;
    }*/
 
 
    //! global const identity matrix
    /** This is only an extern declaration; the object itself is defined elsewhere. */
    IRRLICHT_API extern const matrixSIMD4 IdentityMatrix;
 
} // end namespace core
} // end namespace irr
 
#endif
#endif
 
Post Reply