// Copyright (C) 1987-1995 by Symantec
// Copyright (C) 2000-2011 by Digital Mars
// All Rights Reserved
// http://www.digitalmars.com
// Written by Walter Bright
/*
 * This source file is made available for personal use
 * only. The license is in /dmd/src/dmd/backendlicense.txt
 * or /dm/src/dmd/backendlicense.txt
 * For any other uses, please contact Digital Mars.
 */

#if !SPP

#include        <stdio.h>
#include        <string.h>
#include        <time.h>
#include        <math.h>
#include        "cc.h"
#include        "el.h"
#include        "oper.h"
#include        "code.h"
#include        "global.h"

static char __file__[] = __FILE__;      /* for tassert.h                */
#include        "tassert.h"

// Constants that the 8087 supports directly
// BUG: rewrite for 80 bit long doubles
// loadconst() compares a floating point constant against these values (as
// float and as double) to decide whether one of the dedicated 8087
// load-constant instructions can be used for it.
/* pi, loaded by FLDPI */
#define PI              3.14159265358979323846
/* log10(2), loaded by FLDLG2 */
#define LOG2            0.30102999566398119521
/* ln(2), loaded by FLDLN2 */
#define LN2             0.6931471805599453094172321
/* log2(10), loaded by FLDL2T */
#define LOG2T           3.32192809488736234787
/* log2(e), loaded by FLDL2E */
#define LOG2E           1.4426950408889634074   /* 1/LN2                */

#define FWAIT   0x9B            /* FWAIT opcode                         */

/* Mark variable referenced by e as not a register candidate            */
/* (clears the GTregcand bit in the Sflags of the symbol e refers to)   */
#define notreg(e)       ((e)->EV.sp.Vsym->Sflags &= ~GTregcand)

/* Generate the appropriate ESC instruction     */
/* The result is the opcode byte 0xD8 + MF*2 + b, where MF is one of the
 * enum MF values below and b is 0 or 1.  For example ESC(MFfloat,1) is 0xD9
 * and ESC(MFdouble,1) is 0xDD, the opcode bytes used for the 32 and 64 bit
 * FLD/FSTP forms elsewhere in this file.
 */
#define ESC(MF,b)       (0xD8 + ((MF) << 1) + (b))
enum MF
{       // Values for MF
        // (operand format selector passed as the first argument of ESC())
        MFfloat         = 0,
        MFlong          = 1,
        MFdouble        = 2,
        MFword          = 3
};

// Compile-time model of the 8087 register stack: _8087elems[i] records the
// elem (and offset) that note87() registered as living in ST(i).  An entry
// whose .e is NULL holds nothing that has to be preserved.
NDP _8087elems[8];              // 8087 stack
// Value assigned to an entry to mark it as empty.  It is never written in the
// code visible here, so it keeps its static zero initialization
// (NOTE(review): assumes NDP has no constructor - confirm in its declaration).
NDP ndp_zero;

int stackused = 0;              /* number of items on the 8087 stack    */

/*********************************
 * NOTE(review): neither Dconst nor oldd is referenced in the part of the
 * file visible here.  Judging by the member names and the CW_roundto0 /
 * CW_roundtonearest constants below, this presumably caches symbols holding
 * 8087 control words for the two rounding modes - confirm against the users.
 */

struct Dconst
{
    int round;
    symbol *roundto0;
    symbol *roundtonearest;
};

static Dconst oldd;

// When nonzero, the 8087 stack bookkeeping routines print a trace of what
// they do.
#define NDPP    0       // print out debugging info
// True for 64 bit code generation or when config.fpxmmregs is set; genftst()
// then compares with FUCOMIP, which sets the CPU flags directly, instead of
// going through FSTSW/SAHF.
#define NOSAHF  (I64 || config.fpxmmregs)     // can't use SAHF instruction

// Forward declarations of routines defined later in this file or elsewhere.
code *loadComplex(elem *e);
code *opmod_complex87(elem *e,regm_t *pretregs);
code *opass_complex87(elem *e,regm_t *pretregs);
code * genf2(code *c,unsigned op,unsigned rm);

// 8087 control word values.  The two differ only in bits 10..11 (0xC00),
// which is the rounding control field of the x87 control word:
// 11 = round toward zero, 00 = round to nearest.
#define CW_roundto0             0xFBF
#define CW_roundtonearest       0x3BF

STATIC code *genrnd(code *c, short cw);

/**********************************
 * When we need to temporarily save 8087 registers, we record information
 * about the save into an array of NDP structs:
 * NDP::save[] is grown on demand by getemptyslot(); an entry whose .e is
 * NULL is free.
 */

NDP *NDP::save = NULL;
int NDP::savemax = 0;           /* # of entries in NDP::save[]          */
// NDP::savetop is kept by getemptyslot() at one more than the highest slot
// index it has handed out.
int NDP::savetop = 0;           /* # of entries used in NDP::save[]     */

// Number of entries added to NDP::save[] each time it has to grow.
#ifdef DEBUG
#define NDPSAVEINC 2            /* flush reallocation bugs              */
#else
#define NDPSAVEINC 8            /* allocation chunk sizes               */
#endif

/****************************************
 * Append an FSTP that stores into ndp save location i
 * (ndp_fld() is the matching load).
 * Input:
 *      c       code list to append to
 *      i       index of the save location, emitted as an FLndp operand
 *      ty      type of the value; its basic type selects the 32, 64 or
 *              80 bit form of the instruction
 * Returns:
 *      the extended code list
 */

code *ndp_fstp(code *c, int i, tym_t ty)
{
    unsigned op;                // opcode byte of the store
    unsigned regfield;          // reg field of its mod/rm byte

    switch (tybasic(ty))
    {
        case TYfloat:
        case TYifloat:
        case TYcfloat:
            op = 0xD9;          // FSTP m32real i[BP]
            regfield = 3;
            break;

        case TYdouble:
        case TYdouble_alias:
        case TYidouble:
        case TYcdouble:
            op = 0xDD;          // FSTP m64real i[BP]
            regfield = 3;
            break;

        case TYldouble:
        case TYildouble:
        case TYcldouble:
            op = 0xDB;          // FSTP m80real i[BP]
            regfield = 7;
            break;

        default:
            assert(0);          // not a floating point type
            return c;
    }

    const unsigned rexw = I64 ? (REX_W << 16) : 0;
    return genc1(c,op,rexw | modregrm(2,regfield,BPRM),FLndp,i);
}

/****************************************
 * Append an FLD that loads from ndp save location i
 * (ndp_fstp() is the matching store).
 * Input:
 *      c       code list to append to
 *      i       index of the save location, emitted as an FLndp operand
 *      ty      type of the value; its basic type selects the 32, 64 or
 *              80 bit form of the instruction
 * Returns:
 *      the extended code list
 */
code *ndp_fld(code *c, int i, tym_t ty)
{
    unsigned op;                // opcode byte of the load
    unsigned regfield;          // reg field of its mod/rm byte

    switch (tybasic(ty))
    {
        case TYfloat:
        case TYifloat:
        case TYcfloat:
            op = 0xD9;          // FLD m32real i[BP]
            regfield = 0;
            break;

        case TYdouble:
        case TYdouble_alias:
        case TYidouble:
        case TYcdouble:
            op = 0xDD;          // FLD m64real i[BP]
            regfield = 0;
            break;

        case TYldouble:
        case TYildouble:
        case TYcldouble:
            op = 0xDB;          // FLD m80real i[BP]
            regfield = 5;
            break;

        default:
            assert(0);          // not a floating point type
            return c;
    }

    const unsigned rexw = I64 ? (REX_W << 16) : 0;
    return genc1(c,op,rexw | modregrm(2,regfield,BPRM),FLndp,i);
}

/**************************
 * Find a free entry (one whose .e is NULL) in NDP::save[], growing the
 * array by NDPSAVEINC zeroed entries if every existing entry is taken.
 * NDP::savetop is raised if necessary so that it stays above the index
 * that is returned.
 * Returns:
 *      index of the free entry
 */

STATIC int getemptyslot()
{
        int slot = 0;

        // Scan the entries allocated so far for one that is unused
        while (slot < NDP::savemax && NDP::save[slot].e != NULL)
                slot++;

        if (slot >= NDP::savemax)
        {
                // Every entry is in use: extend the array and hand out the
                // first of the new (zeroed) entries
                const int oldmax = NDP::savemax;

                NDP::save = (NDP *)mem_realloc(NDP::save,
                        (oldmax + NDPSAVEINC) * sizeof(*NDP::save));
                memset(NDP::save + oldmax,0,NDPSAVEINC * sizeof(*NDP::save));
                NDP::savemax = oldmax + NDPSAVEINC;
                slot = oldmax;
        }

        if (NDP::savetop <= slot)
                NDP::savetop = slot + 1;
        return slot;
}

/*********************************
 * Pop 8087 stack.
 * Only the compile-time model is changed: stackused is decremented and the
 * entries of _8087elems[] move down by one, leaving the last one empty.
 * No code is generated here.
 * In DEBUG builds the function takes the caller's line and file, supplied
 * by the pop87() macro defined below.
 */

#undef pop87

void pop87(
#ifdef DEBUG
        int line, const char *file
#endif
        )
#ifdef DEBUG
#define pop87() pop87(__LINE__,__FILE__)
#endif
{
#if NDPP
        dbg_printf("pop87(%s(%d): stackused=%d)\n", file, line, stackused);
#endif
        stackused -= 1;
        assert(stackused >= 0);

        // What was in ST(k) is now in ST(k - 1)
        const int last = arraysize(_8087elems) - 1;
        for (int k = 1; k <= last; k++)
                _8087elems[k - 1] = _8087elems[k];

        // Nothing is tracked in the slot that was vacated
        _8087elems[last] = ndp_zero;
}

/*******************************
 * Push 8087 stack. Generate and return any code
 * necessary to preserve anything that might run off the end of the stack.
 *
 * This only makes room in the compile-time model (_8087elems[], stackused);
 * the instruction that actually loads the new ST(0) is emitted by the caller.
 * Returns:
 *      code that spills the value tracked in _8087elems[7] to a slot of
 *      NDP::save[], or CNIL if that entry was empty
 */

#undef push87

#ifdef DEBUG
// In DEBUG builds push87 takes the caller's line and file, which are used
// only by the NDPP trace.  This zero-argument overload is defined before the
// push87() macro below exists, so it passes its own location.
code *push87(int line, const char *file);
code *push87() { return push87(__LINE__,__FILE__); }
#endif

code *push87(
#ifdef DEBUG
        int line, const char *file
#endif
        )
#ifdef DEBUG
#define push87() push87(__LINE__,__FILE__)
#endif
{
        code *c;
        int i;

        c = CNIL;
        // if we would lose the top register off of the stack
        if (_8087elems[7].e != NULL)
        {
                // Remember the value in a save slot, then emit FDECSTP (which
                // rotates the register stack so the old ST(7) becomes ST(0))
                // followed by an FSTP of it into that slot.
                i = getemptyslot();
                NDP::save[i] = _8087elems[7];
                c = genf2(c,0xD9,0xF6);         // FDECSTP
                c = genfwait(c);
                c = ndp_fstp(c, i, _8087elems[7].e->Ety);       // FSTP i[BP]
                // stackused is left at 8: one value was spilled and one is
                // about to be pushed
                assert(stackused == 8);
                if (NDPP) dbg_printf("push87() : overflow\n");
        }
        else
        {
#ifdef DEBUG
                if (NDPP) dbg_printf("push87(%s(%d): %d)\n", file, line, stackused);
#endif
                stackused++;
                assert(stackused <= 8);
        }
        // Shift the stack up
        // (what was ST(i - 1) becomes ST(i); the new ST(0) starts out with
        // nothing noted in it)
        for (i = 7; i > 0; i--)
                _8087elems[i] = _8087elems[i - 1];
        _8087elems[0] = ndp_zero;
        return c;
}

/*****************************
 * Record that ST(i) holds elem e (at the given offset) and that it is a
 * value we want to keep.
 * Input:
 *      e       elem whose value is in ST(i)
 *      offset  offset stored alongside e in the stack entry
 *      i       8087 register number, must be less than stackused
 * In DEBUG builds the note87() macro adds the caller's line number, which is
 * used only by the NDPP trace.
 */

#ifdef DEBUG
void note87(elem *e, unsigned offset, int i, int linnum);
void note87(elem *e, unsigned offset, int i)
{
    note87(e, offset, i, 0);
}
void note87(elem *e, unsigned offset, int i, int linnum)
#define note87(e,offset,i) note87(e,offset,i,__LINE__)
#else
void note87(elem *e, unsigned offset, int i)
#endif
{
#if NDPP
        printf("note87(e = %p.%d, i = %d, stackused = %d, line = %d)\n",e,offset,i,stackused,linnum);
#endif
        // Only registers that are currently in use can be noted
        assert(i < stackused);

        NDP &entry = _8087elems[i];
        entry.e = e;
        entry.offset = offset;
}

/****************************************************
 * Exchange two entries in 8087 stack.
 */

void xchg87(int i, int j)
{
    NDP save;

    save = _8087elems[i];
    _8087elems[i] = _8087elems[j];
    _8087elems[j] = save;
}

/****************************
 * Make sure that elem e is in register ST(i). Reload it if necessary.
 * Input:
 *      e       elem that is expected in ST(i); must not be NULL
 *      offset  offset that was noted together with e
 *      i       0..3    8087 register number
 *      flag    1       don't bother with FXCH
 * Returns:
 *      CNIL if _8087elems[i] already describes (e, offset); otherwise code
 *      that reloads the value from the NDP::save[] slot it was spilled to
 * In DEBUG builds the makesure87() macro adds the caller's line number,
 * which is used only by the NDPP trace.
 */

#ifdef DEBUG
STATIC code * makesure87(elem *e,unsigned offset,int i,unsigned flag,int linnum)
#define makesure87(e,offset,i,flag)     makesure87(e,offset,i,flag,__LINE__)
#else
STATIC code * makesure87(elem *e,unsigned offset,int i,unsigned flag)
#endif
{
        code *c;
        int j;

#ifdef DEBUG
        if (NDPP) printf("makesure87(e=%p, offset=%d, i=%d, flag=%d, line=%d)\n",e,offset,i,flag,linnum);
#endif
        assert(e && i < 4);
        c = CNIL;
    L1:
        if (_8087elems[i].e != e || _8087elems[i].offset != offset)
        {
                // ST(i) does not hold the value.  It must then be empty, and
                // the value must be found in one of the save slots.
#ifdef DEBUG
                if (_8087elems[i].e)
                    printf("_8087elems[%d].e = %p, .offset = %d\n",i,_8087elems[i].e,_8087elems[i].offset);
#endif
                assert(_8087elems[i].e == NULL);
                // Search NDP::save[0 .. NDP::savetop) for (e, offset)
                for (j = 0; 1; j++)
                {
                    // Not found: if e is a comma expression, start over from
                    // L1 looking for its right operand instead
                    if (j >= NDP::savetop && e->Eoper == OPcomma)
                    {
                        e = e->E2;              // try right side
                        goto L1;
                    }
#ifdef DEBUG
                    if (j >= NDP::savetop)
                        printf("e = %p, NDP::savetop = %d\n",e,NDP::savetop);
#endif
                    assert(j < NDP::savetop);
                    //printf("\tNDP::save[%d] = %p, .offset = %d\n", j, NDP::save[j].e, NDP::save[j].offset);
                    if (e == NDP::save[j].e && offset == NDP::save[j].offset)
                        break;
                }
                // Make room on the stack model and load the saved value,
                // which arrives in ST(0)
                c = push87();
                c = genfwait(c);
                c = ndp_fld(c, j, e->Ety);              // FLD j[BP]
                if (!(flag & 1))
                {
                    // Emit FXCH ST(i), FXCH ST(i-1), ... FXCH ST(1).
                    // c is not NULL after the FLD above, so the return value
                    // of genf2() can be ignored.
                    while (i != 0)
                    {
                        genf2(c,0xD9,0xC8 + i);         // FXCH ST(i)
                        i--;
                    }
                }
                NDP::save[j] = ndp_zero;                // back in 8087
        }
        // Note that the reloaded value is not recorded in _8087elems[] here.
        //_8087elems[i].e = NULL;
        return c;
}

/****************************
 * Save in memory any values in the 8087 that we want to keep.
 * As long as ST(0) holds a noted value, it is recorded in a slot of
 * NDP::save[], stored there with an FSTP, and popped off the stack model.
 * Returns:
 *      the generated stores, or CNIL if there was nothing to save
 */

code *save87()
{
        code *cstores = CNIL;

        while (stackused != 0 && _8087elems[0].e != NULL)
        {
                /* Pick a save slot and remember what goes into it      */
                const int slot = getemptyslot();
                if (NDPP) printf("saving %p in temporary NDP::save[%d]\n",_8087elems[0].e,slot);
                NDP::save[slot] = _8087elems[0];

                cstores = genfwait(cstores);
                cstores = ndp_fstp(cstores,slot,_8087elems[0].e->Ety); // FSTP slot[BP]
                pop87();
        }

        /* If anything was stored, wait for the last store to finish    */
        if (cstores != CNIL)
                genfwait(cstores);
        return cstores;
}

/******************************************
 * Save any noted values that would be destroyed by n pushes
 * Input:
 *      n       number of pushes that are about to happen, at most 7
 * Returns:
 *      code that spills the affected registers to NDP::save[] slots, or NULL
 *      if at most 8 - n registers are in use
 */

code *save87regs(unsigned n)
{
    unsigned j;
    unsigned k;
    code *c = NULL;

    assert(n <= 7);
    j = 8 - n;                  // number of registers that may stay in use
    if (stackused > j)
    {
        // Walk the registers from ST(7) down to ST(j).  Each step emits an
        // FDECSTP; if the register is in use (k <= stackused) its value is
        // then stored with FSTP into a fresh save slot and its entry in the
        // stack model is cleared.
        for (k = 8; k > j; k--)
        {
            c = genf2(c,0xD9,0xF6);     // FDECSTP
            c = genfwait(c);
            if (k <= stackused)
            {   int i;

                i = getemptyslot();
                c = ndp_fstp(c, i, _8087elems[k - 1].e->Ety);   // FSTP i[BP]
                NDP::save[i] = _8087elems[k - 1];
                _8087elems[k - 1] = ndp_zero;
            }
        }

        // Emit one FINCSTP for every FDECSTP above that was not followed by
        // an FSTP (the registers with k > stackused).
        for (k = 8; k > j; k--)
        {
            if (k > stackused)
            {   c = genf2(c,0xD9,0xF7); // FINCSTP
                c = genfwait(c);
            }
        }
        stackused = j;
    }
    return c;
}

/*****************************************************
 * Save/restore ST0 or ST01
 * Input:
 *      regm            mST0 to handle one register, mST01 to handle two
 *      *csave          code list the FSTP stores are appended to
 *      *crestore       code list the FLD reloads are prepended to, so the
 *                      reloads run in the reverse order of the stores
 * Both lists are updated in place.  Every value is saved as an 80 bit real.
 */

void gensaverestore87(regm_t regm, code **csave, code **crestore)
{
    //printf("gensaverestore87(%s)\n", regm_str(regm));
    assert(regm == mST0 || regm == mST01);

    code *saveseq = *csave;
    code *restoreseq = *crestore;
    const int nregs = (regm == mST01) ? 2 : 1;

    for (int n = 0; n < nregs; n++)
    {
        const int slot = getemptyslot();

        // A dummy elem marks the slot as taken; this blocks the slot for the
        // life of this function
        NDP::save[slot].e = el_calloc();
        saveseq = ndp_fstp(saveseq, slot, TYldouble);
        restoreseq = cat(ndp_fld(CNIL, slot, TYldouble), restoreseq);
    }

    *csave = saveseq;
    *crestore = restoreseq;
}

/*************************************
 * Find which, if any, slot on stack holds elem e.
 * Only the first stackused entries of _8087elems[] are examined, and both
 * the elem and the offset have to match.
 * Returns:
 *      index of the matching entry, or -1 if there is none
 */

STATIC int cse_get(elem *e, unsigned offset)
{
    for (int st = 0; st != stackused; st++)
    {
        const NDP &entry = _8087elems[st];

        if (entry.e == e && entry.offset == offset)
            return st;          // found it in ST(st)
    }
    return -1;                  // not on the 8087 stack
}

/*************************************
 * Reload common subexpression.
 * If the value of e is still noted on the 8087 stack, a copy of it is pushed
 * with FLD ST(i); otherwise e is reloaded with loaddata().  For complex
 * types both halves (noted at offset 0 and at half the type size) have to be
 * on the stack for the copy to be used.
 * Input:
 *      e               the common subexpression; freed before returning
 *      pretregs        where the result is wanted
 * Returns:
 *      the generated code
 */

code *comsub87(elem *e,regm_t *pretregs)
{
    //printf("comsub87(e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
    code *c;
    const int st0 = cse_get(e, 0);      // where the value at offset 0 is, or -1

    if (!tycomplex(e->Ety))
    {
        if (st0 < 0)
        {
            // Not on the 8087 stack: reload
            c = loaddata(e,pretregs);
        }
        else
        {
            code *cfix;

            c = push87();
            c = genf2(c,0xD9,0xC0 + st0);       // FLD ST(st0)
            if (*pretregs & XMMREGS)
                cfix = fixresult87(e,mST0,pretregs);
            else
                cfix = fixresult(e,mST0,pretregs);
            c = cat(c,cfix);
        }
    }
    else
    {
        const unsigned sz = tysize(e->Ety);
        const int st1 = cse_get(e, sz / 2);     // where the other half is, or -1

        if (st0 < 0 || st1 < 0)
        {
            // At least one half is not on the 8087 stack: reload
            c = loaddata(e,pretregs);
        }
        else
        {
            c = push87();
            c = cat(c, push87());
            c = genf2(c,0xD9,0xC0 + st0);       // FLD ST(st0)
            // the first FLD moved everything down by one register
            c = genf2(c,0xD9,0xC0 + st1 + 1);   // FLD ST(st1 + 1)
            c = cat(c,fixresult_complex87(e,mST01,pretregs));
        }
    }

    freenode(e);
    return c;
}


/**************************
 * Generate an instruction whose memory operand is floatreg.
 * Sets the globals floatreg and reflocal to TRUE.  When the opcode is one of
 * the ESC opcodes 0xD8..0xDF, genfwait() is applied to c first.
 * Input:
 *      c       code list to append to
 *      opcode  opcode of the instruction
 *      reg     value for the reg field of the mod/rm byte
 *      offset  offset of the operand within floatreg
 * Returns:
 *      the extended code list
 */

code * genfltreg(code *c,unsigned opcode,unsigned reg,targ_size_t offset)
{
        floatreg = TRUE;
        reflocal = TRUE;

        const bool is_esc = ((opcode & ~7) == 0xD8);
        if (is_esc)
        {
            c = genfwait(c);
        }

        const unsigned modrm = modregxrm(2,reg,BPRM);
        return genc1(c,opcode,modrm,FLfltreg,offset);
}

/*******************************
 * Append an FWAIT to c if ADDFWAIT() says one is needed.
 * Returns:
 *      c with the FWAIT appended, or c unchanged
 */

code *genfwait(code *c)
{
    return ADDFWAIT() ? gen1(c,FWAIT) : c;
}

/***************************************
 * Generate floating point instruction.
 * The two byte instruction op,rm is appended to c, preceded by an FWAIT
 * when genfwait() decides one is needed.
 * Returns:
 *      the extended code list
 */

code * genf2(code *c,unsigned op,unsigned rm)
{
    code *cwait = genfwait(c);

    return gen2(cwait,op,rm);
}

/***************************
 * Put the 8087 flags into the CPU flags.
 * Appends to c code that gets AX (getregs), stores the 8087 status word
 * (FSTSW AX on the 80286 and later; otherwise FSTSW to floatreg followed by
 * MOV AH,floatreg+1[BP]) and then executes SAHF.  The code list is marked
 * with CFpsw.
 * Must not be called when NOSAHF is true, because the sequence relies on
 * SAHF; genftst() uses FUCOMIP in that case instead.
 * Returns:
 *      the extended code list
 */

STATIC code * cg87_87topsw(code *c)
{
        /* Note that SAHF is not available on some early I64 processors
         * and will cause a seg fault
         */
        assert(!NOSAHF);        // callers must choose a SAHF-free sequence
        c = cat(c,getregs(mAX));
        if (config.target_cpu >= TARGET_80286)
            c = genf2(c,0xDF,0xE0);             // FSTSW AX
        else
        {   c = genfltreg(c,0xD8+5,7,0);        /* FSTSW floatreg[BP]   */
            // c is not NULL from here on, so the code lists returned by the
            // following calls need not be reassigned
            genfwait(c);                        /* FWAIT                */
            genfltreg(c,0x8A,4,1);              /* MOV AH,floatreg+1[BP] */
        }
        gen1(c,0x9E);                           // SAHF
        code_orflag(c,CFpsw);
        return c;
}

/***************************
 * Set the PSW based on the state of ST0.
 * Input:
 *      c       code list to append to
 *      e       elem being tested; only passed on to callclib()
 *      pop     if stack should be popped after test
 * Returns:
 *      start of code appended to c.
 * One of four code sequences is chosen, depending on NOSAHF, the fast
 * floating point flag and the target CPU.  The stack model is kept in step
 * with push87()/pop87().
 */

STATIC code * genftst(code *c,elem *e,int pop)
{
    if (NOSAHF)
    {
        // SAHF can't be used: compare ST0 against a pushed 0 with FUCOMIP,
        // which sets the CPU flags directly and pops the 0 again
        c = cat(c,push87());
        c = gen2(c,0xD9,0xEE);          // FLDZ
        gen2(c,0xDF,0xE9);              // FUCOMIP ST1
        pop87();
        if (pop)
        {   c = genf2(c,0xDD,modregrm(3,3,0));  // FPOP
            pop87();
        }
    }
    else if (config.flags4 & CFG4fastfloat)  // if fast floating point
    {
        c = genf2(c,0xD9,0xE4);         // FTST
        c = cg87_87topsw(c);            // put 8087 flags in CPU flags
        if (pop)
        {   c = genf2(c,0xDD,modregrm(3,3,0));  // FPOP
            pop87();
        }
    }
    else if (config.target_cpu >= TARGET_80386)
    {
        // FUCOMP doesn't raise exceptions on QNANs, unlike FTST
        // Compare against a pushed 0; the compare pops the 0 and, if pop is
        // set, the tested value as well
        c = cat(c,push87());
        c = gen2(c,0xD9,0xEE);          // FLDZ
        gen2(c,pop ? 0xDA : 0xDD,0xE9); // FUCOMPP / FUCOMP
        pop87();
        if (pop)
            pop87();
        // c is not NULL here, so the returned list need not be reassigned
        cg87_87topsw(c);                // put 8087 flags in CPU flags
    }
    else
    {
        // Call library function which does not raise exceptions
        regm_t regm = 0;

        c = cat(c,callclib(e,CLIBftest,&regm,0));
        if (pop)
        {   c = genf2(c,0xDD,modregrm(3,3,0));  // FPOP
            pop87();
        }
    }
    return c;
}

/*************************************
 * Determine if there is a special 8087 instruction to load
 * constant e.
 * Input:
 *      e       constant elem of a floating point type (for im == 1, of a
 *              complex type)
 *      im      0       load real part
 *              1       load imaginary part
 * Returns:
 *      opcode if found
 *      0 if not
 * The value returned is the second byte of the instruction, e.g. 0xEE for
 * FLDZ, which is generated elsewhere in this file as gen2(c,0xD9,0xEE).
 * Depending on the target CPU and the optimization flags, 0 is returned even
 * for a matching constant when the special instruction is considered slower.
 */

unsigned char loadconst(elem *e, int im)
#if __DMC__
__in
{
    elem_debug(e);
    assert(im == 0 || im == 1);
}
__body
#endif
{
    // Parallel tables: entry i of fval[], dval[] and ldval[] is the constant,
    // in each of the three precisions, that the instruction opcode[i] loads.
    static float fval[7] =
        {0.0,1.0,PI,LOG2T,LOG2E,LOG2,LN2};
    static double dval[7] =
        {0.0,1.0,PI,LOG2T,LOG2E,LOG2,LN2};
    static longdouble ldval[7] =
#if __DMC__    // from math.h
    {0.0,1.0,M_PI_L,M_LOG2T_L,M_LOG2E_L,M_LOG2_L,M_LN2_L};
#elif _MSC_VER // struct longdouble constants
    {ld_zero, ld_one, ld_pi, ld_log2t, ld_log2e, ld_log2, ld_ln2};
#else          // C99 hexadecimal floats (GCC, CLANG, ...)
#define M_PI_L          0x1.921fb54442d1846ap+1L        // 3.14159 fldpi
#define M_LOG2T_L       0x1.a934f0979a3715fcp+1L        // 3.32193 fldl2t
#define M_LOG2E_L       0x1.71547652b82fe178p+0L        // 1.4427 fldl2e
#define M_LOG2_L        0x1.34413509f79fef32p-2L        // 0.30103 fldlg2
#define M_LN2_L         0x1.62e42fefa39ef358p-1L        // 0.693147 fldln2
    {0.0,1.0,M_PI_L,M_LOG2T_L,M_LOG2E_L,M_LOG2_L,M_LN2_L};
#endif
    // The extra last entry is the "not found" result returned when the
    // search loop below runs to completion.
    static char opcode[7 + 1] =
        /* FLDZ,FLD1,FLDPI,FLDL2T,FLDL2E,FLDLG2,FLDLN2,0 */
        {0xEE,0xE8,0xEB,0xE9,0xEA,0xEC,0xED,0};
    int i;
    targ_float f;
    targ_double d;
    targ_ldouble ld;
    int sz;             // number of bytes of the value that are compared
    int zero;           // nonzero if the value is +0
    void *p;            // points to whichever of f, d, ld holds the value
    static char zeros[sizeof(longdouble)];      // all zero bytes (static storage)

    // Fetch the requested part of the constant into the local of matching
    // precision
    if (im == 0)
    {
        switch (tybasic(e->Ety))
        {
            case TYfloat:
            case TYifloat:
            case TYcfloat:
                f = e->EV.Vfloat;
                sz = 4;
                p = &f;
                break;

            case TYdouble:
            case TYdouble_alias:
            case TYidouble:
            case TYcdouble:
                d = e->EV.Vdouble;
                sz = 8;
                p = &d;
                break;

            case TYldouble:
            case TYildouble:
            case TYcldouble:
                ld = e->EV.Vldouble;
                sz = 10;
                p = &ld;
                break;

            default:
                assert(0);
        }
    }
    else
    {
        // Only complex types have an imaginary part to load
        switch (tybasic(e->Ety))
        {
            case TYcfloat:
                f = e->EV.Vcfloat.im;
                sz = 4;
                p = &f;
                break;

            case TYcdouble:
                d = e->EV.Vcdouble.im;
                sz = 8;
                p = &d;
                break;

            case TYcldouble:
                ld = e->EV.Vcldouble.im;
                sz = 10;
                p = &ld;
                break;

            default:
                assert(0);
        }
    }

    // Note that for this purpose, -0 is not regarded as +0,
    // since FLDZ loads a +0
    // (the test is therefore a bytewise comparison against all zero bits)
    zero = (memcmp(p, zeros, sz) == 0);
    if (zero && config.target_cpu >= TARGET_PentiumPro)
        return 0xEE;            // FLDZ is the only one with 1 micro-op

    // For some reason, these instructions take more clocks
    if (config.flags4 & CFG4speed && config.target_cpu >= TARGET_Pentium)
        return 0;

    if (zero)
        return 0xEE;

    // Search the remaining constants (entry 0, zero, was handled above).
    // A mismatch continues with the next entry; a match breaks out of the
    // loop with i identifying the instruction.
    for (i = 1; i < arraysize(fval); i++)
    {
        switch (sz)
        {
            case 4:
                if (fval[i] != f)
                    continue;
                break;
            case 8:
                if (dval[i] != d)
                    continue;
                break;
            case 10:
                if (ldval[i] != ld)
                    continue;
                break;
            default:
                assert(0);
        }
        break;
    }
    // If nothing matched, i is 7 here and opcode[7] is 0
    return opcode[i];
}

/******************************
 * Given the result of an expression is in retregs,
 * generate necessary code to return result in *pretregs.
 * Input:
 *      e               the expression; its type determines the operand size
 *      retregs         registers the result is currently in
 *      *pretregs       registers (and/or mPSW) the result is wanted in
 * Returns:
 *      the generated code
 * Transfers between the 8087 and the CPU or XMM registers go through
 * floatreg (see genfltreg()).  If mST01 is requested the work is handed to
 * fixresult_complex87().  When the result ends up in ST0 it is noted with
 * note87().
 */


code *fixresult87(elem *e,regm_t retregs,regm_t *pretregs)
{
    regm_t regm;
    tym_t tym;
    code *c1,*c2;       // two code lists, concatenated (c1 first) at the end
    unsigned sz;

    //printf("fixresult87(e = %p, retregs = x%x, *pretregs = x%x)\n", e,retregs,*pretregs);
    //printf("fixresult87(e = %p, retregs = %s, *pretregs = %s)\n", e,regm_str(retregs),regm_str(*pretregs));
    assert(!*pretregs || retregs);
    c1 = CNIL;
    c2 = CNIL;
    tym = tybasic(e->Ety);
    sz = tysize[tym];
    //printf("tym = x%x, sz = %d\n", tym, sz);

    if (*pretregs & mST01)
        return fixresult_complex87(e, retregs, pretregs);

    /* if retregs needs to be transferred into the 8087 */
    if (*pretregs & mST0 && retregs & (mBP | ALLREGS))
    {
        assert(sz <= DOUBLESIZE);
        if (!I16)
        {

            if (*pretregs & mPSW)
            {   // Set flags
                regm_t r = retregs | mPSW;
                c1 = fixresult(e,retregs,&r);
            }
            // Store the register(s) to floatreg and load that into the 8087
            c2 = push87();
            if (sz == REGSIZE || (I64 && sz == 4))
            {
                // The value is in a single register
                unsigned reg = findreg(retregs);
                c2 = genfltreg(c2,0x89,reg,0);          // MOV fltreg,reg
                genfltreg(c2,0xD9,0,0);                 // FLD float ptr fltreg
            }
            else
            {   unsigned msreg,lsreg;

                // The value is in a register pair
                msreg = findregmsw(retregs);
                lsreg = findreglsw(retregs);
                c2 = genfltreg(c2,0x89,lsreg,0);        // MOV fltreg,lsreg
                genfltreg(c2,0x89,msreg,4);             // MOV fltreg+4,msreg
                genfltreg(c2,0xDD,0,0);                 // FLD double ptr fltreg
            }
        }
        else
        {
            // 16 bit code: move the value into FLOATREGS or DOUBLEREGS and
            // call a library routine (CLIBfltto87 / CLIBdblto87)
            regm = (sz == FLOATSIZE) ? FLOATREGS : DOUBLEREGS;
            regm |= *pretregs & mPSW;
            c1 = fixresult(e,retregs,&regm);
            regm = 0;           // don't worry about result from CLIBxxx
            c2 = callclib(e,
                    ((sz == FLOATSIZE) ? CLIBfltto87 : CLIBdblto87),
                    &regm,0);
        }
    }
    /* if the result is in the 8087 and is wanted in CPU registers */
    else if (*pretregs & (mBP | ALLREGS) && retregs & mST0)
    {   unsigned mf;
        unsigned reg;

        assert(sz <= DOUBLESIZE);
        mf = (sz == FLOATSIZE) ? MFfloat : MFdouble;
        // Set the flags first if they are wanted too; ST0 is kept (pop = 0)
        // because it is stored below
        if (*pretregs & mPSW && !(retregs & mPSW))
                c1 = genftst(c1,e,0);
        /* FSTP floatreg        */
        pop87();
        c1 = genfltreg(c1,ESC(mf,1),3,0);
        genfwait(c1);
        // Load the register(s) from floatreg
        c2 = allocreg(pretregs,&reg,(sz == FLOATSIZE) ? TYfloat : TYdouble);
        if (sz == FLOATSIZE)
        {
            if (!I16)
                c2 = genfltreg(c2,0x8B,reg,0);
            else
            {   c2 = genfltreg(c2,0x8B,reg,REGSIZE);
                genfltreg(c2,0x8B,findreglsw(*pretregs),0);
            }
        }
        else
        {   assert(sz == DOUBLESIZE);
            if (I16)
            {   // four 16 bit words, most significant in AX
                c2 = genfltreg(c2,0x8B,AX,6);
                genfltreg(c2,0x8B,BX,4);
                genfltreg(c2,0x8B,CX,2);
                genfltreg(c2,0x8B,DX,0);
            }
            else if (I32)
            {   c2 = genfltreg(c2,0x8B,reg,REGSIZE);
                genfltreg(c2,0x8B,findreglsw(*pretregs),0);
            }
            else // I64
            {
                c2 = genfltreg(c2,0x8B,reg,0);
                code_orrex(c2, REX_W);
            }
        }
    }
    /* if the result in ST0 is not wanted at all, just pop it */
    else if (*pretregs == 0 && retregs == mST0)
    {
        c1 = genf2(c1,0xDD,modregrm(3,3,0));    // FPOP
        pop87();
    }
    else
    {   if (*pretregs & mPSW)
        {   if (!(retregs & mPSW))
            {   assert(retregs & mST0);
                // pop ST0 after the test unless it is still needed, in ST0
                // itself or for the transfer to an XMM register below
                c1 = genftst(c1,e,!(*pretregs & (mST0 | XMMREGS))); // FTST
            }
        }
        if (*pretregs & mST0 && retregs & XMMREGS)
        {
            // XMM register to ST0: store the XMM register to floatreg and
            // load that into the 8087
            assert(sz <= DOUBLESIZE);
            unsigned mf = (sz == FLOATSIZE) ? MFfloat : MFdouble;
            // MOVD floatreg,XMM?
            unsigned reg = findreg(retregs);
            c1 = genfltreg(c1,xmmstore(tym),reg - XMM0,0);
            c2 = push87();
            c2 = genfltreg(c2,ESC(mf,1),0,0);                 // FLD float/double ptr fltreg
        }
        else if (retregs & mST0 && *pretregs & XMMREGS)
        {
            // ST0 to XMM register: store ST0 to floatreg and load the XMM
            // register from there
            assert(sz <= DOUBLESIZE);
            unsigned mf = (sz == FLOATSIZE) ? MFfloat : MFdouble;
            // FSTP floatreg
            pop87();
            c1 = genfltreg(c1,ESC(mf,1),3,0);
            genfwait(c1);
            // MOVD XMM?,floatreg
            unsigned reg;
            c2 = allocreg(pretregs,&reg,(sz == FLOATSIZE) ? TYfloat : TYdouble);
            c2 = genfltreg(c2,xmmload(tym),reg -XMM0,0);
        }
        else
            // No transfer needed: if ST0 is wanted, the result must already
            // be there
            assert(!(*pretregs & mST0) || (retregs & mST0));
    }
    // Record that ST0 now holds the value of e
    if (*pretregs & mST0)
        note87(e,0,0);
    return cat(c1,c2);
}

/********************************
 * Generate in-line 8087 code for the following operators:
 *      add
 *      min
 *      mul
 *      div
 *      cmp
 */

// Reverse the order that the op is done in
// Apart from the leading -1, the table maps 0..3 to themselves and exchanges
// 4 with 5 and 6 with 7.
// NOTE(review): presumably indexed with op + 1, so that the subtract and
// divide operations are replaced by their reversed forms; the use is outside
// the part of the file visible here - confirm.
static const char oprev[9] = { -1,0,1,2,3,5,4,7,6 };

/* Generate x87 code for the binary floating point operator node e.
 *
 * Params:
 *      e        = operator node; its operands are e->E1 and e->E2
 *      pretregs = registers/flags the caller wants the result in;
 *                 must be non-zero on entry (asserted below)
 * Returns:
 *      the generated code list
 *
 * The switch keys on (operator, basic type of E1, basic type of E2), with
 * every relational operator folded into the OPeqeq key. The compare cases
 * and the cases involving complex operands build their code and return
 * from inside the switch. The cases that only select an x87 opcode number
 * in `op` break out of the switch into the common tail at the bottom.
 */
code *orth87(elem *e,regm_t *pretregs)
{
    unsigned op;
    code *c1,*c2,*c3,*c4;
    code *cx;
    regm_t retregs;
    regm_t resregm;
    elem *e1;
    elem *e2;
    int e2oper;
    int eoper;
    unsigned sz2;
    int clib = CLIBMAX;         // initialize to invalid value
    int reverse = 0;

    //printf("orth87(+e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
#if 1   // we could be evaluating / for side effects only
    assert(*pretregs != 0);
#endif
    retregs = mST0;
    resregm = mST0;

    e1 = e->E1;
    e2 = e->E2;
    c3 = CNIL;
    c4 = CNIL;
    // sz2 is passed below as the offset of the second component of e1:
    // the full size of e1, halved when e1 is complex.
    sz2 = tysize(e1->Ety);
    if (tycomplex(e1->Ety))
        sz2 /= 2;

    eoper = e->Eoper;
    if (eoper == OPmul && e2->Eoper == OPconst && el_toldouble(e->E2) == 2.0L)
    {
        // Perform "mul 2.0" as fadd ST(0), ST
        c1 = codelem(e1,&retregs,FALSE);
        c1 = genf2(c1, 0xDC, 0xC0);             // fadd ST(0), ST;
        c2 = fixresult87(e,mST0,pretregs);      // result is in ST(0).
        freenode(e2);
        return cat(c1,c2);
    }

    // All relational operators share the OPeqeq cases; the actual operator
    // is still read from e->Eoper inside those cases.
    if (OTrel(eoper))
        eoper = OPeqeq;
    // Pack the operator and both operand types into a single switch key.
    #define X(op, ty1, ty2)     (((op) << 16) + (ty1) * 256 + (ty2))
    switch (X(eoper, tybasic(e1->Ety), tybasic(e2->Ety)))
    {
        case X(OPadd, TYfloat, TYfloat):
        case X(OPadd, TYdouble, TYdouble):
        case X(OPadd, TYdouble_alias, TYdouble_alias):
        case X(OPadd, TYldouble, TYldouble):
        case X(OPadd, TYldouble, TYdouble):
        case X(OPadd, TYdouble, TYldouble):
        case X(OPadd, TYifloat, TYifloat):
        case X(OPadd, TYidouble, TYidouble):
        case X(OPadd, TYildouble, TYildouble):
            op = 0;                             // FADDP
            break;

        case X(OPmin, TYfloat, TYfloat):
        case X(OPmin, TYdouble, TYdouble):
        case X(OPmin, TYdouble_alias, TYdouble_alias):
        case X(OPmin, TYldouble, TYldouble):
        case X(OPmin, TYldouble, TYdouble):
        case X(OPmin, TYdouble, TYldouble):
        case X(OPmin, TYifloat, TYifloat):
        case X(OPmin, TYidouble, TYidouble):
        case X(OPmin, TYildouble, TYildouble):
            op = 4;                             // FSUBP
            break;

        case X(OPmul, TYfloat, TYfloat):
        case X(OPmul, TYdouble, TYdouble):
        case X(OPmul, TYdouble_alias, TYdouble_alias):
        case X(OPmul, TYldouble, TYldouble):
        case X(OPmul, TYldouble, TYdouble):
        case X(OPmul, TYdouble, TYldouble):
        case X(OPmul, TYifloat, TYifloat):
        case X(OPmul, TYidouble, TYidouble):
        case X(OPmul, TYildouble, TYildouble):
        case X(OPmul, TYfloat, TYifloat):
        case X(OPmul, TYdouble, TYidouble):
        case X(OPmul, TYldouble, TYildouble):
        case X(OPmul, TYifloat, TYfloat):
        case X(OPmul, TYidouble, TYdouble):
        case X(OPmul, TYildouble, TYldouble):
            op = 1;                             // FMULP
            break;

        case X(OPdiv, TYfloat, TYfloat):
        case X(OPdiv, TYdouble, TYdouble):
        case X(OPdiv, TYdouble_alias, TYdouble_alias):
        case X(OPdiv, TYldouble, TYldouble):
        case X(OPdiv, TYldouble, TYdouble):
        case X(OPdiv, TYdouble, TYldouble):
        case X(OPdiv, TYifloat, TYifloat):
        case X(OPdiv, TYidouble, TYidouble):
        case X(OPdiv, TYildouble, TYildouble):
            op = 6;                             // FDIVP
            break;

        case X(OPmod, TYfloat, TYfloat):
        case X(OPmod, TYdouble, TYdouble):
        case X(OPmod, TYdouble_alias, TYdouble_alias):
        case X(OPmod, TYldouble, TYldouble):
        case X(OPmod, TYfloat, TYifloat):
        case X(OPmod, TYdouble, TYidouble):
        case X(OPmod, TYldouble, TYildouble):
        case X(OPmod, TYifloat, TYifloat):
        case X(OPmod, TYidouble, TYidouble):
        case X(OPmod, TYildouble, TYildouble):
        case X(OPmod, TYifloat, TYfloat):
        case X(OPmod, TYidouble, TYdouble):
        case X(OPmod, TYildouble, TYldouble):
            // No opcode number for modulo: the tail after the switch tests
            // e->Eoper == OPmod and emits an FPREM loop instead of using op.
            op = (unsigned) -1;
            break;

        case X(OPeqeq, TYfloat, TYfloat):
        case X(OPeqeq, TYdouble, TYdouble):
        case X(OPeqeq, TYdouble_alias, TYdouble_alias):
        case X(OPeqeq, TYldouble, TYldouble):
        case X(OPeqeq, TYifloat, TYifloat):
        case X(OPeqeq, TYidouble, TYidouble):
        case X(OPeqeq, TYildouble, TYildouble):
            // Real or imaginary compare. The caller must not ask for the
            // result in ST0 (asserted); resregm starts out as mPSW.
            assert(OTrel(e->Eoper));
            assert((*pretregs & mST0) == 0);
            c1 = codelem(e1,&retregs,FALSE);
            note87(e1,0,0);
            resregm = mPSW;

            // First arm: rel_exception() is true for this operator or
            // fast-float is configured; FCOM-family instructions are used.
            // Second arm: FUCOM-family instructions or a library call.
            if (rel_exception(e->Eoper) || config.flags4 & CFG4fastfloat)
            {
                // cnst(e2) && !boolres(e2): e2 is a constant that tests false,
                // so e1 is compared against zero without evaluating e2.
                if (cnst(e2) && !boolres(e2))
                {
                    if (NOSAHF)
                    {
                        c1 = cat(c1,push87());
                        c1 = gen2(c1,0xD9,0xEE);            // FLDZ
                        gen2(c1,0xDF,0xF1);                 // FCOMIP ST1
                        pop87();
                    }
                    else
                    {   c1 = genf2(c1,0xD9,0xE4);           // FTST
                        c1 = cg87_87topsw(c1);
                    }
                    c2 = genf2(NULL,0xDD,modregrm(3,3,0));      // FPOP
                    pop87();
                }
                else if (NOSAHF)
                {
                    note87(e1,0,0);
                    c2 = load87(e2,0,&retregs,e1,-1);
                    c2 = cat(c2,makesure87(e1,0,1,0));
                    resregm = 0;
                    //c2 = genf2(c2,0xD9,0xC8 + 1);       // FXCH ST1
                    c2 = gen2(c2,0xDF,0xF1);            // FCOMIP ST1
                    pop87();
                    genf2(c2,0xDD,modregrm(3,3,0));     // FPOP
                    pop87();
                }
                else
                {
                    c2 = load87(e2, 0, pretregs, e1, 3);        // FCOMPP
                }
            }
            else
            {
                if (cnst(e2) && !boolres(e2) &&
                    config.target_cpu < TARGET_80386)
                {
                    regm_t regm = 0;

                    c2 = callclib(e,CLIBftest0,&regm,0);
                    pop87();
                }
                else
                {
                    note87(e1,0,0);
                    c2 = load87(e2,0,&retregs,e1,-1);
                    c2 = cat(c2,makesure87(e1,0,1,0));
                    resregm = 0;
                    if (NOSAHF)
                    {
                        c3 = gen2(CNIL,0xDF,0xE9);              // FUCOMIP ST1
                        pop87();
                        genf2(c3,0xDD,modregrm(3,3,0));         // FPOP
                        pop87();
                    }
                    else if (config.target_cpu >= TARGET_80386)
                    {
                        c3 = gen2(CNIL,0xDA,0xE9);      // FUCOMPP
                        c3 = cg87_87topsw(c3);
                        pop87();
                        pop87();
                    }
                    else
                        // Call a function instead so that exceptions
                        // are not generated.
                        c3 = callclib(e,CLIBfcompp,&resregm,0);
                }
            }

            freenode(e2);
            return cat4(c1,c2,c3,c4);

        case X(OPadd, TYcfloat, TYcfloat):
        case X(OPadd, TYcdouble, TYcdouble):
        case X(OPadd, TYcldouble, TYcldouble):
        case X(OPadd, TYcfloat, TYfloat):
        case X(OPadd, TYcdouble, TYdouble):
        case X(OPadd, TYcldouble, TYldouble):
        case X(OPadd, TYfloat, TYcfloat):
        case X(OPadd, TYdouble, TYcdouble):
        case X(OPadd, TYldouble, TYcldouble):
            goto Lcomplex;

        case X(OPadd, TYifloat, TYcfloat):
        case X(OPadd, TYidouble, TYcdouble):
        case X(OPadd, TYildouble, TYcldouble):
            goto Lcomplex2;

        case X(OPmin, TYcfloat, TYcfloat):
        case X(OPmin, TYcdouble, TYcdouble):
        case X(OPmin, TYcldouble, TYcldouble):
        case X(OPmin, TYcfloat, TYfloat):
        case X(OPmin, TYcdouble, TYdouble):
        case X(OPmin, TYcldouble, TYldouble):
        case X(OPmin, TYfloat, TYcfloat):
        case X(OPmin, TYdouble, TYcdouble):
        case X(OPmin, TYldouble, TYcldouble):
            goto Lcomplex;

        case X(OPmin, TYifloat, TYcfloat):
        case X(OPmin, TYidouble, TYcdouble):
        case X(OPmin, TYildouble, TYcldouble):
            goto Lcomplex2;

        case X(OPmul, TYcfloat, TYcfloat):
        case X(OPmul, TYcdouble, TYcdouble):
        case X(OPmul, TYcldouble, TYcldouble):
            clib = CLIBcmul;
            goto Lcomplex;

        case X(OPdiv, TYcfloat, TYcfloat):
        case X(OPdiv, TYcdouble, TYcdouble):
        case X(OPdiv, TYcldouble, TYcldouble):
        case X(OPdiv, TYfloat, TYcfloat):
        case X(OPdiv, TYdouble, TYcdouble):
        case X(OPdiv, TYldouble, TYcldouble):
        case X(OPdiv, TYifloat, TYcfloat):
        case X(OPdiv, TYidouble, TYcdouble):
        case X(OPdiv, TYildouble, TYcldouble):
            clib = CLIBcdiv;
            goto Lcomplex;

        case X(OPdiv, TYifloat,   TYfloat):
        case X(OPdiv, TYidouble,  TYdouble):
        case X(OPdiv, TYildouble, TYldouble):
            op = 6;                             // FDIVP
            break;

        // Lcomplex: both operands are loaded as complex pairs (four stack
        // entries). Add and subtract are done in line with two FADDP/FSUBP
        // ST(2),ST; anything else goes through callclib() with the routine
        // chosen in clib by the jumping case.
        Lcomplex:
            c1 = loadComplex(e1);
            c2 = loadComplex(e2);
            c3 = makesure87(e1, sz2, 2, 0);
            c3 = cat(c3,makesure87(e1, 0, 3, 0));
            retregs = mST01;
            if (eoper == OPadd)
            {
                c4 = genf2(NULL, 0xDE, 0xC0+2); // FADDP ST(2),ST
                genf2(c4, 0xDE, 0xC0+2);        // FADDP ST(2),ST
                pop87();
                pop87();
            }
            else if (eoper == OPmin)
            {
                c4 = genf2(NULL, 0xDE, 0xE8+2); // FSUBP ST(2),ST
                genf2(c4, 0xDE, 0xE8+2);        // FSUBP ST(2),ST
                pop87();
                pop87();
            }
            else
                c4 = callclib(e, clib, &retregs, 0);
            c4 = cat(c4, fixresult_complex87(e, retregs, pretregs));
            return cat4(c1,c2,c3,c4);

        // Lcomplex2: imaginary e1 (a single stack entry) combined with a
        // complex e2 (a pair); only add and subtract arrive here.
        Lcomplex2:
            retregs = mST0;
            c1 = codelem(e1, &retregs, FALSE);
            note87(e1, 0, 0);
            c2 = loadComplex(e2);
            c3 = makesure87(e1, 0, 2, 0);
            retregs = mST01;
            if (eoper == OPadd)
            {
                c4 = genf2(NULL, 0xDE, 0xC0+2); // FADDP ST(2),ST
            }
            else if (eoper == OPmin)
            {
                c4 = genf2(NULL, 0xDE, 0xE8+2); // FSUBP ST(2),ST
                c4 = genf2(c4, 0xD9, 0xE0);     // FCHS
            }
            else
                assert(0);
            pop87();
            c4 = genf2(c4, 0xD9, 0xC8 + 1);     // FXCH ST(1)
            c4 = cat(c4, fixresult_complex87(e, retregs, pretregs));
            return cat4(c1,c2,c3,c4);

        case X(OPeqeq, TYcfloat, TYcfloat):
        case X(OPeqeq, TYcdouble, TYcdouble):
        case X(OPeqeq, TYcldouble, TYcldouble):
        case X(OPeqeq, TYcfloat, TYifloat):
        case X(OPeqeq, TYcdouble, TYidouble):
        case X(OPeqeq, TYcldouble, TYildouble):
        case X(OPeqeq, TYcfloat, TYfloat):
        case X(OPeqeq, TYcdouble, TYdouble):
        case X(OPeqeq, TYcldouble, TYldouble):
        case X(OPeqeq, TYifloat, TYcfloat):
        case X(OPeqeq, TYidouble, TYcdouble):
        case X(OPeqeq, TYildouble, TYcldouble):
        case X(OPeqeq, TYfloat, TYcfloat):
        case X(OPeqeq, TYdouble, TYcdouble):
        case X(OPeqeq, TYldouble, TYcldouble):
        case X(OPeqeq, TYfloat, TYifloat):
        case X(OPeqeq, TYdouble, TYidouble):
        case X(OPeqeq, TYldouble, TYildouble):
        case X(OPeqeq, TYifloat, TYfloat):
        case X(OPeqeq, TYidouble, TYdouble):
        case X(OPeqeq, TYildouble, TYldouble):
            // Mixed or complex compare: both sides are widened to complex
            // pairs by loadComplex() and compared by the CLIBccmp routine.
            c1 = loadComplex(e1);
            c2 = loadComplex(e2);
            c3 = makesure87(e1, sz2, 2, 0);
            c3 = cat(c3,makesure87(e1, 0, 3, 0));
            retregs = 0;
            c4 = callclib(e, CLIBccmp, &retregs, 0);
            return cat4(c1,c2,c3,c4);


        case X(OPadd, TYfloat, TYifloat):
        case X(OPadd, TYdouble, TYidouble):
        case X(OPadd, TYldouble, TYildouble):
        case X(OPadd, TYifloat, TYfloat):
        case X(OPadd, TYidouble, TYdouble):
        case X(OPadd, TYildouble, TYldouble):

        case X(OPmin, TYfloat, TYifloat):
        case X(OPmin, TYdouble, TYidouble):
        case X(OPmin, TYldouble, TYildouble):
        case X(OPmin, TYifloat, TYfloat):
        case X(OPmin, TYidouble, TYdouble):
        case X(OPmin, TYildouble, TYldouble):
            // Real +/- imaginary (either order): no arithmetic instruction
            // is emitted. Both operands are loaded, e2 is negated for
            // subtraction, and the two entries are exchanged when e1 is
            // the imaginary one; the pair is then handed on as mST01.
            retregs = mST0;
            c1 = codelem(e1, &retregs, FALSE);
            note87(e1, 0, 0);
            c2 = codelem(e2, &retregs, FALSE);
            c3 = makesure87(e1, 0, 1, 0);
            if (eoper == OPmin)
                c3 = genf2(c3, 0xD9, 0xE0);     // FCHS
            if (tyimaginary(e1->Ety))
                c3 = genf2(c3, 0xD9, 0xC8 + 1); // FXCH ST(1)
            retregs = mST01;
            c4 = fixresult_complex87(e, retregs, pretregs);
            return cat4(c1,c2,c3,c4);

        case X(OPadd, TYcfloat, TYifloat):
        case X(OPadd, TYcdouble, TYidouble):
        case X(OPadd, TYcldouble, TYildouble):
            op = 0;
            goto Lci;

        case X(OPmin, TYcfloat, TYifloat):
        case X(OPmin, TYcdouble, TYidouble):
        case X(OPmin, TYcldouble, TYildouble):
            op = 4;
            goto Lci;

        // Lci: complex e1 op imaginary e2. load87() applies op to ST0 with
        // e2 as the other operand; sz2 is passed as the offset of e1 that
        // ST0 is expected to hold.
        Lci:
            c1 = loadComplex(e1);
            retregs = mST0;
            c2 = load87(e2,sz2,&retregs,e1,op);
            freenode(e2);
            retregs = mST01;
            c3 = makesure87(e1,0,1,0);
            c4 = fixresult_complex87(e, retregs, pretregs);
            return cat4(c1,c2,c3,c4);

        case X(OPmul, TYcfloat, TYfloat):
        case X(OPmul, TYcdouble, TYdouble):
        case X(OPmul, TYcldouble, TYldouble):
            c1 = loadComplex(e1);
            goto Lcm1;

        case X(OPmul, TYcfloat, TYifloat):
        case X(OPmul, TYcdouble, TYidouble):
        case X(OPmul, TYcldouble, TYildouble):
            c1 = loadComplex(e1);
            c1 = genf2(c1, 0xD9, 0xE0);         // FCHS
            genf2(c1,0xD9,0xC8 + 1);            // FXCH ST(1)
            // When elemisone(e2) holds, the negate-and-exchange above is
            // the whole product, so the scaling multiply is skipped.
            if (elemisone(e2))
            {
                freenode(e2);
                c2 = NULL;
                c3 = NULL;
                goto Lcd4;
            }
            goto Lcm1;

        // Lcm1: the complex pair of e1 is on the stack; load the single
        // entry e2 on top of it, then scale both components at Lcm2.
        Lcm1:
            retregs = mST0;
            c2 = codelem(e2, &retregs, FALSE);
            c3 = makesure87(e1, sz2, 1, 0);
            c3 = cat(c3,makesure87(e1, 0, 2, 0));
            goto Lcm2;

        case X(OPmul, TYfloat, TYcfloat):
        case X(OPmul, TYdouble, TYcdouble):
        case X(OPmul, TYldouble, TYcldouble):
            retregs = mST0;
            c1 = codelem(e1, &retregs, FALSE);
            note87(e1, 0, 0);
            c2 = loadComplex(e2);
            c3 = makesure87(e1, 0, 2, 0);
            c3 = genf2(c3,0xD9,0xC8 + 1);       // FXCH ST(1)
            genf2(c3,0xD9,0xC8 + 2);            // FXCH ST(2)
            goto Lcm2;

        case X(OPmul, TYifloat, TYcfloat):
        case X(OPmul, TYidouble, TYcdouble):
        case X(OPmul, TYildouble, TYcldouble):
            retregs = mST0;
            c1 = codelem(e1, &retregs, FALSE);
            note87(e1, 0, 0);
            c2 = loadComplex(e2);
            c3 = makesure87(e1, 0, 2, 0);
            c3 = genf2(c3, 0xD9, 0xE0);         // FCHS
            genf2(c3,0xD9,0xC8 + 2);            // FXCH ST(2)
            goto Lcm2;

        // Lcm2: ST0 is the scalar factor; multiply it into ST(2) and ST(1),
        // popping the factor with the second multiply.
        Lcm2:
            c3 = genf2(c3,0xDC,0xC8 + 2);       // FMUL ST(2), ST
            genf2(c3,0xDE,0xC8 + 1);            // FMULP ST(1), ST
            goto Lcd3;

        case X(OPdiv, TYcfloat, TYfloat):
        case X(OPdiv, TYcdouble, TYdouble):
        case X(OPdiv, TYcldouble, TYldouble):
            c1 = loadComplex(e1);
            retregs = mST0;
            c2 = codelem(e2, &retregs, FALSE);
            c3 = makesure87(e1, sz2, 1, 0);
            c3 = cat(c3,makesure87(e1, 0, 2, 0));
            goto Lcd1;

        case X(OPdiv, TYcfloat, TYifloat):
        case X(OPdiv, TYcdouble, TYidouble):
        case X(OPdiv, TYcldouble, TYildouble):
            c1 = loadComplex(e1);
            c1 = genf2(c1,0xD9,0xC8 + 1);       // FXCH ST(1)
            // Keep the simulated stack in step with the FXCH just emitted.
            xchg87(0, 1);
            genf2(c1, 0xD9, 0xE0);              // FCHS
            retregs = mST0;
            c2 = codelem(e2, &retregs, FALSE);
            c3 = makesure87(e1, 0, 1, 0);
            c3 = cat(c3,makesure87(e1, sz2, 2, 0));
        // Lcd1: ST0 is the scalar divisor; divide it into ST(2) and ST(1),
        // popping the divisor with the second divide.
        Lcd1:
            c3 = genf2(c3,0xDC,0xF8 + 2);       // FDIV ST(2), ST
            genf2(c3,0xDE,0xF8 + 1);            // FDIVP ST(1), ST
        // Lcd3: account for the one stack entry popped by the code above.
        Lcd3:
            pop87();
        // Lcd4: the complex result is the pair in mST01.
        Lcd4:
            retregs = mST01;
            c4 = fixresult_complex87(e, retregs, pretregs);
            return cat4(c1, c2, c3, c4);

        case X(OPmod, TYcfloat, TYfloat):
        case X(OPmod, TYcdouble, TYdouble):
        case X(OPmod, TYcldouble, TYldouble):
        case X(OPmod, TYcfloat, TYifloat):
        case X(OPmod, TYcdouble, TYidouble):
        case X(OPmod, TYcldouble, TYildouble):
            /*
                        fld     E1.re
                        fld     E1.im
                        fld     E2
                        fxch    ST(1)
                FM1:    fprem
                        fstsw   word ptr sw
                        fwait
                        mov     AH, byte ptr sw+1
                        jp      FM1
                        fxch    ST(2)
                FM2:    fprem
                        fstsw   word ptr sw
                        fwait
                        mov     AH, byte ptr sw+1
                        jp      FM2
                        fstp    ST(1)
                        fxch    ST(1)
             */
            c1 = loadComplex(e1);
            retregs = mST0;
            c2 = codelem(e2, &retregs, FALSE);
            c3 = makesure87(e1, sz2, 1, 0);
            c3 = cat(c3,makesure87(e1, 0, 2, 0));
            c3 = genf2(c3, 0xD9, 0xC8 + 1);             // FXCH ST(1)

            // Each loop is built as its own list cx so that the JP can
            // target the head of that list (the FPREM).
            cx = gen2(NULL, 0xD9, 0xF8);                // FPREM
            cx = cg87_87topsw(cx);
            cx = genjmp(cx, JP, FLcode, (block *)cx);   // JP FM1
            cx = genf2(cx, 0xD9, 0xC8 + 2);             // FXCH ST(2)
            c3 = cat(c3,cx);

            cx = gen2(NULL, 0xD9, 0xF8);                // FPREM
            cx = cg87_87topsw(cx);
            cx = genjmp(cx, JP, FLcode, (block *)cx);   // JP FM2
            cx = genf2(cx,0xDD,0xD8 + 1);               // FSTP ST(1)
            cx = genf2(cx, 0xD9, 0xC8 + 1);             // FXCH ST(1)
            c3 = cat(c3,cx);

            goto Lcd3;

        default:
#ifdef DEBUG
            elem_print(e);
#endif
            assert(0);
            break;
    }
    #undef X

    // Common tail for the cases that only selected an opcode number in op
    // (or (unsigned)-1 for OPmod).
    e2oper = e2->Eoper;

    /* Move double-sized operand into the second position if there's a chance
     * it will allow combining a load with an operation (DMD Bugzilla 2905)
     */
    if ( ((tybasic(e1->Ety) == TYdouble)
          && ((e1->Eoper == OPvar) || (e1->Eoper == OPconst))
          && (tybasic(e2->Ety) != TYdouble)) ||
        (e1->Eoper == OPconst) ||
        (e1->Eoper == OPvar &&
         ((e1->Ety & (mTYconst | mTYimmutable) && !OTleaf(e2oper)) ||
          (e2oper == OPd_f &&
            (e2->E1->Eoper == OPs32_d || e2->E1->Eoper == OPs64_d || e2->E1->Eoper == OPs16_d) &&
            e2->E1->E1->Eoper == OPvar
          ) ||
          ((e2oper == OPs32_d || e2oper == OPs64_d || e2oper == OPs16_d) &&
            e2->E1->Eoper == OPvar
          )
         )
        )
       )
    {   // Reverse order of evaluation
        // op is replaced by its operand-reversed counterpart; for OPmod
        // op + 1 wraps to 0, which selects the table's -1 entry.
        e1 = e->E2;
        e2 = e->E1;
        op = oprev[op + 1];
        reverse ^= 1;
    }

    c1 = codelem(e1,&retregs,FALSE);
    note87(e1,0,0);

    if (config.flags4 & CFG4fdivcall && e->Eoper == OPdiv)
    {
        // Division through the CLIBfdiv87 library routine instead of an
        // in-line FDIV. This retregs shadows the function-level one.
        regm_t retregs = mST0;
        c2 = load87(e2,0,&retregs,e1,-1);
        c2 = cat(c2,makesure87(e1,0,1,0));
        if (op == 7)                    // if reverse divide
            c2 = genf2(c2,0xD9,0xC8 + 1);       // FXCH ST(1)
        c2 = cat(c2,callclib(e,CLIBfdiv87,&retregs,0));
        pop87();
        resregm = mST0;
        freenode(e2);
        c4 = fixresult87(e,resregm,pretregs);
    }
    else if (e->Eoper == OPmod)
    {
        /*
         *              fld     tbyte ptr y
         *              fld     tbyte ptr x             // ST = x, ST1 = y
         *      FM1:    // We don't use fprem1 because for some inexplicable
         *              // reason we get -5 when we do _modulo(15, 10)
         *              fprem                           // ST = ST % ST1
         *              fstsw   word ptr sw
         *              fwait
         *              mov     AH,byte ptr sw+1        // get msb of status word in AH
         *              sahf                            // transfer to flags
         *              jp      FM1                     // continue till ST < ST1
         *              fstp    ST(1)                   // leave remainder on stack
         */
        // This retregs shadows the function-level one.
        regm_t retregs = mST0;
        c2 = load87(e2,0,&retregs,e1,-1);
        c2 = cat(c2,makesure87(e1,0,1,0));      // now have x,y on stack; need y,x
        if (!reverse)                           // if not reverse modulo
            c2 = genf2(c2,0xD9,0xC8 + 1);       // FXCH ST(1)

        // The JP targets the head of c3, i.e. the FPREM itself.
        c3 = gen2(NULL, 0xD9, 0xF8);            // FM1: FPREM
        c3 = cg87_87topsw(c3);
        c3 = genjmp(c3, JP, FLcode, (block *)c3); // JP FM1
        c3 = genf2(c3,0xDD,0xD8 + 1);           // FSTP ST(1)

        pop87();
        resregm = mST0;
        freenode(e2);
        c4 = fixresult87(e,resregm,pretregs);
    }
    else
    {   // Ordinary case: load87() loads e2, applies op, and places the
        // result according to pretregs.
        c2 = load87(e2,0,pretregs,e1,op);
        freenode(e2);
    }
    if (*pretregs & mST0)
        note87(e,0,0);
    //printf("orth87(-e = %p, *pretregs = x%x)\n", e, *pretregs);
    return cat4(c1,c2,c3,c4);
}

/*****************************
 * Load e onto the x87 stack as a complex pair (mST01).
 *
 * A complex e is loaded directly. A real e gets a zero pushed after it
 * (as the imaginary part) and an imaginary e gets a zero pushed before
 * it (as the real part), so in every case the real part ends up in
 * ST(1) and the imaginary part in ST(0).
 *
 * Params:
 *      e = real, imaginary or complex expression; any other type asserts
 * Returns:
 *      the generated code list
 */

code *loadComplex(elem *e)
{
    code *cl;
    regm_t regs;
    int imoff = tysize(e->Ety);         // offset recorded for the ST(0) entry

    switch (tybasic(e->Ety))
    {
        case TYcfloat:
        case TYcdouble:
        case TYcldouble:
            // Already a pair: the second component sits at half the size
            imoff /= 2;
            regs = mST01;
            cl = codelem(e,&regs,FALSE);
            break;

        case TYfloat:
        case TYdouble:
        case TYldouble:
            // Value first, then a zero on top of it as the imaginary part
            regs = mST0;
            cl = codelem(e,&regs,FALSE);
            cl = cat(cl, push87());
            cl = gen2(cl,0xD9,0xEE);            // FLDZ
            break;

        case TYifloat:
        case TYidouble:
        case TYildouble:
            // Zero first as the real part, then the value on top of it
            cl = push87();
            cl = gen2(cl,0xD9,0xEE);            // FLDZ
            regs = mST0;
            cl = cat(cl, codelem(e,&regs,FALSE));
            break;

        default:
            assert(0);
    }

    // Record both halves of e in the simulated stack:
    // offset 0 at ST(1), offset imoff at ST(0).
    note87(e, 0, 1);
    note87(e, imoff, 0);
    return cl;
}

/*************************
 * If op == -1, load expression e into ST0.
 * else compute (eleft op e), eleft is in ST0.
 * Must follow same logic as cmporder87();
 *
 * Params:
 *      e        = expression to load, or the right operand when op != -1
 *      eoffset  = offset within eleft, forwarded to note87()/makesure87()
 *                 to name the part of eleft that is on the stack
 *      pretregs = where the caller wants the result; handed to fixresult87()
 *      eleft    = left operand already on the x87 stack; only used when
 *                 op != -1 (the recursive call below passes NULL with -1)
 *      op       = -1, or the /digit field of the x87 arithmetic opcode:
 *                 0 add, 1 mul, 3 FCOMP, 4 sub, 6 div (5 and 7 are the
 *                 reversed forms, see oprev[])
 * Returns:
 *      the generated code list
 */

code *load87(elem *e,unsigned eoffset,regm_t *pretregs,elem *eleft,int op)
{
        code *ccomma,*c,*c2,*cpush;
        code cs;
        regm_t retregs;
        unsigned reg,mf,mf1;
        int opr;
        unsigned char ldop;
        tym_t ty;
        int i;

#if NDPP
        printf("+load87(e=%p, eoffset=%d, *pretregs=%s, eleft=%p, op=%d, stackused = %d)\n",e,eoffset,regm_str(*pretregs),eleft,op,stackused);
#endif
        elem_debug(e);
        ccomma = NULL;
        cpush = NULL;
        if (ADDFWAIT())
            cs.Iflags = CFwait;
        else
            cs.Iflags = 0;
        cs.Irex = 0;
        // opr is the operand-reversed form of op. It is used by the paths
        // that first load e onto the stack (so e is ST0 and eleft is ST1)
        // and then emit a register-register "FopRP".
        opr = oprev[op + 1];
        ty = tybasic(e->Ety);
        // A long double operand is never combined with the operation as a
        // memory operand: it is loaded onto the stack first (Ldefault)
        // unless it is an OPd_ld conversion.
        if ((ty == TYldouble || ty == TYildouble) &&
            op != -1 && e->Eoper != OPd_ld)
            goto Ldefault;
        // mf selects the memory operand format: 32-bit float or 64-bit double
        mf = (ty == TYfloat || ty == TYifloat || ty == TYcfloat) ? MFfloat : MFdouble;
    L5:
        switch (e->Eoper)
        {
            case OPcomma:
                // docommas() updates e; dispatch again on the new e.
                ccomma = docommas(&e);
//              if (op != -1)
//                  ccomma = cat(ccomma,makesure87(eleft,eoffset,0,0));
                goto L5;

            case OPvar:
                notreg(e);
                // falls through to OPind
            case OPind:
            L2:
                if (op != -1)
                {
                    // If cse_get() finds e at stack position i (>= 0),
                    // operate directly against ST(i); b2[] holds the second
                    // opcode byte for each op.
                    if (e->Ecount && e->Ecount != e->Ecomsub &&
                        (i = cse_get(e, 0)) >= 0)
                    {   static unsigned char b2[8] = {0xC0,0xC8,0xD0,0xD8,0xE0,0xE8,0xF0,0xF8};

                        c = genf2(NULL,0xD8,b2[op] + i);        // Fop ST(i)
                    }
                    else
                    {
                        // Fop with e as a memory operand
                        c = getlvalue(&cs,e,0);
                        if (I64)
                            cs.Irex &= ~REX_W;                  // don't use for x87 ops
                        c = cat(c,makesure87(eleft,eoffset,0,0));
                        cs.Iop = ESC(mf,0);
                        cs.Irm |= modregrm(0,op,0);
                        c = gen(c,&cs);
                    }
                }
                else
                {
                    // Plain load: make room on the stack, then FLD by type
                    cpush = push87();
                    switch (ty)
                    {
                        case TYfloat:
                        case TYdouble:
                        case TYifloat:
                        case TYidouble:
                        case TYcfloat:
                        case TYcdouble:
                        case TYdouble_alias:
                            c = loadea(e,&cs,ESC(mf,1),0,0,0,0);        // FLD var
                            break;
                        case TYldouble:
                        case TYildouble:
                        case TYcldouble:
                            c = loadea(e,&cs,0xDB,5,0,0,0);             // FLD var
                            break;
                        default:
                            // __debug printf("ty = x%x\n", ty);
                            assert(0);
                            break;
                    }
                    note87(e,0,0);
                }
                break;
            case OPd_f:
            case OPf_d:
            case OPd_ld:
                // Floating point size conversion: operate on the operand
                // e->E1 directly; mf1 is the memory format of that operand.
                mf1 = (tybasic(e->E1->Ety) == TYfloat || tybasic(e->E1->Ety) == TYifloat)
                        ? MFfloat : MFdouble;
                if (op != -1 && stackused)
                    note87(eleft,eoffset,0);    // don't trash this value
                if (e->E1->Eoper == OPvar || e->E1->Eoper == OPind)
                {
#if 1
                // L4 is also entered from L6 (integer conversions) with
                // mf1 set to MFword or MFlong.
                L4:
                    c = getlvalue(&cs,e->E1,0);
                    cs.Iop = ESC(mf1,0);
                    if (ADDFWAIT())
                        cs.Iflags |= CFwait;
                    if (!I16)
                        cs.Iflags &= ~CFopsize;
                    if (op != -1)
                    {   cs.Irm |= modregrm(0,op,0);
                        c = cat(c,makesure87(eleft,eoffset,0,0));
                    }
                    else
                    {   cs.Iop |= 1;
                        c = cat(c,push87());
                    }
                    c = gen(c,&cs);                     /* FLD / Fop    */
#else
                    c = loadea(e->E1,&cs,ESC(mf1,1),0,0,0,0); /* FLD e->E1 */
#endif
                    /* Variable cannot be put into a register anymore   */
                    if (e->E1->Eoper == OPvar)
                        notreg(e->E1);
                    freenode(e->E1);
                }
                else
                {
                    // Operand is not addressable: evaluate it into ST0,
                    // then apply the reversed op against eleft in ST1.
                    retregs = mST0;
                    c = codelem(e->E1,&retregs,FALSE);
                    if (op != -1)
                    {   c = cat(c,makesure87(eleft,eoffset,1,0));
                        c = genf2(c,0xDE,modregrm(3,opr,1)); // FopRP
                        pop87();
                    }
                }
                break;

            case OPs64_d:
                // 64-bit integer to floating point: always FILD onto the
                // stack first (from memory, or through the floatreg temporary
                // when the value is in registers), then apply op if any.
                if (e->E1->Eoper == OPvar ||
                    (e->E1->Eoper == OPind && e->E1->Ecount == 0))
                {
                    c = getlvalue(&cs,e->E1,0);
                    cs.Iop = 0xDF;
                    if (ADDFWAIT())
                        cs.Iflags |= CFwait;
                    if (!I16)
                        cs.Iflags &= ~CFopsize;
                    c = cat(c,push87());
                    cs.Irm |= modregrm(0,5,0);
                    c = gen(c,&cs);                     // FILD m64
                    // Variable cannot be put into a register anymore
                    if (e->E1->Eoper == OPvar)
                        notreg(e->E1);
                    freenode(e->E1);
                }
                else if (I64)
                {
                    retregs = ALLREGS;
                    c = codelem(e->E1,&retregs,FALSE);
                    reg = findreg(retregs);
                    c = genfltreg(c,0x89,reg,0);        // MOV floatreg,reg
                    code_orrex(c, REX_W);
                    c = cat(c,push87());
                    c = genfltreg(c,0xDF,5,0);          // FILD long long ptr floatreg
                }
                else
                {
                    retregs = ALLREGS;
                    c = codelem(e->E1,&retregs,FALSE);
                    reg = findreglsw(retregs);
                    c = genfltreg(c,0x89,reg,0);        // MOV floatreg,reglsw
                    reg = findregmsw(retregs);
                    c = genfltreg(c,0x89,reg,4);        // MOV floatreg+4,regmsw
                    c = cat(c,push87());
                    c = genfltreg(c,0xDF,5,0);          // FILD long long ptr floatreg
                }
                if (op != -1)
                {   c = cat(c,makesure87(eleft,eoffset,1,0));
                    c = genf2(c,0xDE,modregrm(3,opr,1)); // FopRP
                    pop87();
                }
                break;

            case OPconst:
                // Only constants for which loadconst() returns a non-zero
                // FLDx opcode byte are handled here; anything else asserts.
                ldop = loadconst(e, 0);
                if (ldop)
                {
                    cpush = push87();
                    c = genf2(NULL,0xD9,ldop);          // FLDx
                    if (op != -1)
                    {   genf2(c,0xDE,modregrm(3,opr,1));        // FopRP
                        pop87();
                    }
                }
                else
                {
                    assert(0);
                }
                break;

            case OPu16_d:
            {
                /* This opcode should never be generated        */
                /* (probably shouldn't be for 16 bit code too)  */
                assert(!I32);

                if (op != -1)
                    note87(eleft,eoffset,0);    // don't trash this value
                retregs = ALLREGS & mLSW;
                c = codelem(e->E1,&retregs,FALSE);
                c = regwithvalue(c,ALLREGS & mMSW,0,&reg,0);  // 0-extend
                retregs |= mask[reg];
                mf1 = MFlong;
                goto L3;
            }
            case OPs16_d:       mf1 = MFword;   goto L6;
            case OPs32_d:       mf1 = MFlong;   goto L6;
            L6:
                // Signed 16/32-bit integer to floating point
                if (op != -1)
                    note87(eleft,eoffset,0);    // don't trash this value
                if (e->E1->Eoper == OPvar ||
                    (e->E1->Eoper == OPind && e->E1->Ecount == 0))
                {
                    // Addressable operand: share the memory-operand code at L4
                    goto L4;
                }
                else
                {
                    retregs = ALLREGS;
                    c = codelem(e->E1,&retregs,FALSE);
                // L3: the integer is in registers (retregs); store it to the
                // floatreg temporary and use that as the memory operand.
                L3:
                    if (I16 && e->Eoper != OPs16_d)
                    {
                        /* MOV floatreg+2,reg   */
                        reg = findregmsw(retregs);
                        c = genfltreg(c,0x89,reg,REGSIZE);
                        retregs &= mLSW;
                    }
                    reg = findreg(retregs);
                    c = genfltreg(c,0x89,reg,0);        /* MOV floatreg,reg */
                    if (op != -1)
                    {   c = cat(c,makesure87(eleft,eoffset,0,0));
                        genfltreg(c,ESC(mf1,0),op,0);   /* Fop floatreg */
                    }
                    else
                    {
                        /* FLD long ptr floatreg        */
                        c = cat(c,push87());
                        c = genfltreg(c,ESC(mf1,1),0,0);
                    }
                }
                break;
            default:
            Ldefault:
                // General case: evaluate e into ST0, then (if op != -1)
                // combine it with eleft using a register-register FopRP.
                retregs = mST0;
#if 1           /* Do this instead of codelem() to avoid the freenode(e).
                   We also lose CSE capability  */
                if (e->Eoper == OPconst)
                {
                    c = load87(e, 0, &retregs, NULL, -1);
                }
                else
                    c = (*cdxxx[e->Eoper])(e,&retregs);
#else
                c = codelem(e,&retregs,FALSE);
#endif
                if (op != -1)
                {
                    // The last argument is set only for add and mul (op 0/1).
                    c = cat(c,makesure87(eleft,eoffset,1,(op == 0 || op == 1)));
                    pop87();
                    // If the code so far ends in FXCH ST(1), turn that
                    // instruction into a NOP and use the non-reversed
                    // opcode instead.
                    if (op == 4 || op == 6)     // sub or div
                    {   code *cl;

                        cl = code_last(c);
                        if (cl && cl->Iop == 0xD9 && cl->Irm == 0xC9)   // FXCH ST(1)
                        {   cl->Iop = NOP;
                            opr = op;           // reverse operands
                        }
                    }
                    c = genf2(c,0xDE,modregrm(3,opr,1));        // FopRP
                }
                break;
        }
        if (op == 3)                    // FCOMP
        {   pop87();                    // extra pop was done
            cg87_87topsw(c);
        }
        // A compare leaves its result in the flags; everything else in ST0.
        c2 = fixresult87(e,((op == 3) ? mPSW : mST0),pretregs);
#if NDPP
        printf("-load87(e=%p, eoffset=%d, *pretregs=%s, eleft=%p, op=%d, stackused = %d)\n",e,eoffset,regm_str(*pretregs),eleft,op,stackused);
#endif
        return cat4(ccomma,cpush,c,c2);
}

/********************************
 * Decide in which order the operands of an 8087 compare are used.
 * Returns:
 *      0       the compare is done forwards
 *      1       the compare is done backwards
 * Must follow same logic as load87().
 */

int cmporder87(elem *e)
{
    //printf("cmporder87(%p)\n",e);

    // For a comma expression only its rightmost operand decides the order
    while (e->Eoper == OPcomma)
        e = e->E2;

    switch (e->Eoper)
    {
        case OPd_f:
        case OPf_d:
        case OPd_ld:
            // Forwards only if the conversion operand is a variable or an indirection
            return (e->E1->Eoper == OPvar || e->E1->Eoper == OPind) ? 0 : 1;

        case OPconst:
        {
            // Backwards for constants loadconst() recognizes and for
            // any (imaginary) long double constant
            if (loadconst(e, 0))
            {
                //printf("ret 1, loadconst(e) = %d\n", loadconst(e));
                return 1;
            }
            tym_t tyc = tybasic(e->Ety);
            return (tyc == TYldouble || tyc == TYildouble) ? 1 : 0;
        }

        case OPvar:
        case OPind:
        {
            // (imaginary) long double operands are done backwards
            tym_t tym = tybasic(e->Ety);
            return (tym == TYldouble || tym == TYildouble) ? 1 : 0;
        }

        case OPu16_d:
        case OPs16_d:
        case OPs32_d:
            return 0;

        default:                // everything else, including OPs64_d
            return 1;
    }
}

/*******************************
 * Perform an assignment to a long double/double/float.
 * Input:
 *      e               OPeq node; E1 is the lvalue, E2 is the value to store
 *      *pretregs       where the caller wants the value of the assignment
 * Returns:
 *      the generated code
 * E2 is evaluated into ST0 and stored through the effective address of E1.
 * If the value is also wanted by the caller it is kept on the 8087 stack,
 * otherwise the store pops it.
 */

code *eq87(elem *e,regm_t *pretregs)
{
        regm_t retregs;
        code *c1,*c2;                   // c1: evaluation of E2, c2: the store itself
        code cs;                        // instruction template for the store
        unsigned op1;                   // opcode byte of the store
        unsigned op2;                   // reg field of the store's mod/rm byte
        tym_t ty1;                      // basic type of the lvalue

        //printf("+eq87(e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
        assert(e->Eoper == OPeq);
        // Get the rvalue into ST0; pass along a request for the flags
        retregs = mST0 | (*pretregs & mPSW);
        c1 = codelem(e->E2,&retregs,FALSE);
        ty1 = tybasic(e->E1->Ety);
        // Start out with the popping form of the store for the lvalue's type
        switch (ty1)
        {   case TYdouble_alias:
            case TYidouble:
            case TYdouble:      op1 = ESC(MFdouble,1);  op2 = 3; break;
            case TYifloat:
            case TYfloat:       op1 = ESC(MFfloat,1);   op2 = 3; break;
            case TYildouble:
            case TYldouble:     op1 = 0xDB;             op2 = 7; break;
            default:
                assert(0);
        }
        if (*pretregs & (mST0 | ALLREGS | mBP | XMMREGS)) // if want result on stack too
        {
            if (ty1 == TYldouble || ty1 == TYildouble)
            {
                // The long double store always pops (op2 stays 7), so
                // duplicate ST0 first and let the store pop the copy
                c1 = cat(c1,push87());
                c1 = genf2(c1,0xD9,0xC0);       // FLD ST(0)
                pop87();
            }
            else
                op2 = 2;                        // FST e->E1
        }
        else
        {                                       // FSTP e->E1
            pop87();
        }
#if 0
        // Doesn't work if ST(0) gets saved to the stack by getlvalue()
        c2 = loadea(e->E1,&cs,op1,op2,0,0,0);
#else
        cs.Irex = 0;
        cs.Iflags = 0;
        cs.Iop = op1;
        if (*pretregs & (mST0 | ALLREGS | mBP | XMMREGS)) // if want result on stack too
        {   // Make sure it's still there
            elem *e2 = e->E2;
            // The value of a comma expression is that of its rightmost operand
            while (e2->Eoper == OPcomma)
                e2 = e2->E2;
            note87(e2,0,0);
            c2 = getlvalue(&cs, e->E1, 0);
            c2 = cat(c2,makesure87(e2,0,0,1));
        }
        else
        {
            c2 = getlvalue(&cs, e->E1, 0);
        }
        cs.Irm |= modregrm(0,op2,0);            // OR in reg field
        // NOTE(review): exactly one of these adjustments is applied; other
        // routines in this file test !I16 before clearing CFopsize - confirm
        // that I64 is meant to be excluded from the first case.
        if (I32)
            cs.Iflags &= ~CFopsize;
        else if (ADDFWAIT())
            cs.Iflags |= CFwait;
        else if (I64)
            cs.Irex &= ~REX_W;
        c2 = gen(c2, &cs);
#if LNGDBLSIZE == 12
        if (tysize[TYldouble] == 12)
        {
        /* This deals with the fact that 10 byte reals really
         * occupy 12 bytes by zeroing the extra 2 bytes.
         */
        if (op1 == 0xDB)
        {
            // Reuse the store's addressing mode for a MOV of immediate 0
            cs.Iop = 0xC7;                      // MOV EA+10,0
            NEWREG(cs.Irm, 0);
            cs.IEV1.sp.Voffset += 10;
            cs.IFL2 = FLconst;
            cs.IEV2.Vint = 0;
            cs.Iflags |= CFopsize;
            c2 = gen(c2, &cs);
        }
        }
#endif  // LNGDBLSIZE == 12
        if (tysize[TYldouble] == 16)
        {
        /* This deals with the fact that 10 byte reals really
         * occupy 16 bytes by zeroing the extra 6 bytes.
         */
        if (op1 == 0xDB)
        {
            cs.Irex &= ~REX_W;
            cs.Iop = 0xC7;                      // MOV EA+10,0
            NEWREG(cs.Irm, 0);
            cs.IEV1.sp.Voffset += 10;
            cs.IFL2 = FLconst;
            cs.IEV2.Vint = 0;
            cs.Iflags |= CFopsize;
            c2 = gen(c2, &cs);

            // Same MOV again, 2 bytes further on and without CFopsize
            cs.IEV1.sp.Voffset += 2;            // MOV EA+12,0
            cs.Iflags &= ~CFopsize;
            c2 = gen(c2, &cs);
        }
        }
#endif  // closes the #if 0 / #else above
        c2 = genfwait(c2);
        freenode(e->E1);
        c1 = cat3(c1,c2,fixresult87(e,mST0 | mPSW,pretregs));
        return c1;
}

/*******************************
 * Perform an assignment to a complex long double/double/float.
 * Input:
 *      e               OPeq node; E1 is the complex lvalue, E2 the value
 *      *pretregs       where the caller wants the value of the assignment
 * Returns:
 *      the generated code
 * E2 is evaluated into the ST0/ST1 pair and both halves are stored, the
 * half at offset sz (half the size of the complex type) first and then
 * the half at offset 0.
 */

code *complex_eq87(elem *e,regm_t *pretregs)
{
        regm_t retregs;
        code *c1,*c2;                   // c1: evaluation of E2, c2: the stores
        code cs;                        // instruction template for the stores
        unsigned op1;                   // opcode byte of the store
        unsigned op2;                   // reg field of the store's mod/rm byte
        unsigned sz;                    // size of one half of the complex value
        tym_t ty1;                      // basic type of the lvalue
        int fxch = 0;                   // !=0: swap ST0/ST1 around the second store

        //printf("complex_eq87(e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
        assert(e->Eoper == OPeq);
        cs.Iflags = ADDFWAIT() ? CFwait : 0;
        cs.Irex = 0;
        retregs = mST01 | (*pretregs & mPSW);
        c1 = codelem(e->E2,&retregs,FALSE);
        ty1 = tybasic(e->E1->Ety);
        // Start out with the popping form of the store for the lvalue's type
        switch (ty1)
        {
            case TYcdouble:     op1 = ESC(MFdouble,1);  op2 = 3; break;
            case TYcfloat:      op1 = ESC(MFfloat,1);   op2 = 3; break;
            case TYcldouble:    op1 = 0xDB;             op2 = 7; break;
            default:
                assert(0);
        }
        if (*pretregs & (mST01 | mXMM0 | mXMM1))  // if want result on stack too
        {
            if (ty1 == TYcldouble)
            {
                // The long double store always pops (op2 stays 7), so
                // duplicate the ST0/ST1 pair and let the stores pop the copies
                c1 = cat(c1,push87());
                c1 = cat(c1,push87());
                c1 = genf2(c1,0xD9,0xC0 + 1);   // FLD ST(1)
                genf2(c1,0xD9,0xC0 + 1);        // FLD ST(1)
                pop87();
                pop87();
            }
            else
            {   op2 = 2;                        // FST e->E1
                fxch = 1;
            }
        }
        else
        {                                       // FSTP e->E1
            pop87();
            pop87();
        }
        sz = tysize(ty1) / 2;
        if (*pretregs & (mST01 | mXMM0 | mXMM1))
        {
            cs.Iflags = 0;
            cs.Irex = 0;
            cs.Iop = op1;
            c2 = getlvalue(&cs, e->E1, 0);
            cs.IEVoffset1 += sz;                // address the half at offset sz
            cs.Irm |= modregrm(0, op2, 0);
            c2 = cat(c2, makesure87(e->E2, sz, 0, 0));
            c2 = gen(c2, &cs);                  // store the half at offset sz
            c2 = genfwait(c2);
            c2 = cat(c2, makesure87(e->E2,  0, 1, 0));
        }
        else
        {
            c2 = loadea(e->E1,&cs,op1,op2,sz,0,0);      // store the half at offset sz
            c2 = genfwait(c2);
        }
        // With a non-popping store the other half is still in ST(1):
        // bring it to ST0 for its store and swap back afterwards
        if (fxch)
            c2 = genf2(c2,0xD9,0xC8 + 1);       // FXCH ST(1)
        cs.IEVoffset1 -= sz;
        gen(c2, &cs);                           // store the half at offset 0
        if (fxch)
            genf2(c2,0xD9,0xC8 + 1);            // FXCH ST(1)
        if (tysize[TYldouble] == 12)
        {
            /* 10 byte reals occupy 12 bytes each here: zero the 2 extra
             * bytes of both halves.
             */
            if (op1 == 0xDB)
            {
                // Reuse the store's addressing mode for a MOV of immediate 0
                cs.Iop = 0xC7;                      // MOV EA+10,0
                NEWREG(cs.Irm, 0);
                cs.IEV1.sp.Voffset += 10;
                cs.IFL2 = FLconst;
                cs.IEV2.Vint = 0;
                cs.Iflags |= CFopsize;
                c2 = gen(c2, &cs);
                cs.IEVoffset1 += 12;
                c2 = gen(c2, &cs);                  // MOV EA+22,0
            }
        }
        if (tysize[TYldouble] == 16)
        {
            /* 10 byte reals occupy 16 bytes each here: zero the 6 extra
             * bytes of both halves, each with one MOV that has CFopsize set
             * followed by one 2 bytes further on that has it cleared.
             */
            if (op1 == 0xDB)
            {
                cs.Iop = 0xC7;                      // MOV EA+10,0
                NEWREG(cs.Irm, 0);
                cs.IEV1.sp.Voffset += 10;
                cs.IFL2 = FLconst;
                cs.IEV2.Vint = 0;
                cs.Iflags |= CFopsize;
                c2 = gen(c2, &cs);

                cs.IEV1.sp.Voffset += 2;            // MOV EA+12,0
                cs.Iflags &= ~CFopsize;
                c2 = gen(c2, &cs);

                cs.IEV1.sp.Voffset += 14;           // MOV EA+26,0
                cs.Iflags |= CFopsize;
                c2 = gen(c2, &cs);

                cs.IEV1.sp.Voffset += 2;            // MOV EA+28,0
                cs.Iflags &= ~CFopsize;
                c2 = gen(c2, &cs);
            }
        }
        c2 = genfwait(c2);
        freenode(e->E1);
        return cat3(c1,c2,fixresult_complex87(e,mST01 | mPSW,pretregs));
}

/*******************************
 * Perform an assignment while converting to integral type,
 * i.e. handle (e1 = (int) e2)
 * Input:
 *      e               OPeq node whose E2 is the float to integer conversion
 *      *pretregs       must be 0, the value of the assignment is not produced
 * Returns:
 *      code that loads the operand of the conversion into ST0 and stores
 *      it as an integer into E1 with the rounding mode set to round to 0
 *      for the duration of the store
 */

code *cnvteq87(elem *e,regm_t *pretregs)
{
        code cs;

        assert(e->Eoper == OPeq);
        assert(!*pretregs);

        elem *econv = e->E2;            // the conversion node
        elem *edst = e->E1;             // the integer lvalue

        // Evaluate the operand of the conversion into ST0
        regm_t retregs = mST0;
        elem_debug(econv);
        code *cload = codelem(econv->E1,&retregs,FALSE);

        // Select the integer store that matches the conversion
        unsigned storeop;               // opcode byte
        unsigned storereg;              // reg field of the mod/rm byte
        if (econv->Eoper == OPd_s16)
        {
            storeop = ESC(MFword,1);
            storereg = 3;
        }
        else if (econv->Eoper == OPd_s32 || econv->Eoper == OPd_u16)
        {
            storeop = ESC(MFlong,1);
            storereg = 3;
        }
        else if (econv->Eoper == OPd_s64)
        {
            storeop = 0xDF;
            storereg = 7;
        }
        else
        {
            assert(0);
        }
        freenode(econv);

        cload = genfwait(cload);
        cload = genrnd(cload, CW_roundto0);             // FLDCW roundto0

        // The store pops ST0
        pop87();
        cs.Iflags = ADDFWAIT() ? CFwait : 0;
        if (edst->Eoper == OPvar)
            notreg(edst);                               // cannot be put in register anymore
        code *cstore = loadea(edst,&cs,storeop,storereg,0,0,0);

        cstore = genfwait(cstore);
        cstore = genrnd(cstore, CW_roundtonearest);     // FLDCW roundtonearest

        freenode(edst);
        return cat(cload,cstore);
}

/**********************************
 * Perform +=, -=, *=, /= and %= for doubles.
 * Also reached for OPpostinc and OPpostdec, which are treated like
 * += and -=.
 * Input:
 *      e               the op= node; E1 is the lvalue, E2 the right operand
 *      *pretregs       where the caller wants the result
 * Returns:
 *      the generated code
 * Complex lvalues are handed off to opmod_complex87() (for %=) or
 * opass_complex87() (everything else).
 */

code *opass87(elem *e,regm_t *pretregs)
{
        regm_t retregs;
        code *cl,*cr,*c;                // cl: lvalue address, cr: rvalue, c: operation and store
        code cs;                        // instruction template addressing e->E1
        unsigned op;                    // reg field (already shifted) for Fop with memory operand
        unsigned opld;                  // mod/rm byte for the FopP ST(1) form
        unsigned op1;                   // opcode byte of the load/store for ty1
        unsigned op2;                   // reg field of the store
        tym_t ty1;                      // basic type of the lvalue

        ty1 = tybasic(e->E1->Ety);
        switch (ty1)
        {   case TYdouble_alias:
            case TYidouble:
            case TYdouble:      op1 = ESC(MFdouble,1);  op2 = 3; break;
            case TYifloat:
            case TYfloat:       op1 = ESC(MFfloat,1);   op2 = 3; break;
            case TYildouble:
            case TYldouble:     op1 = 0xDB;             op2 = 7; break;

            case TYcfloat:
            case TYcdouble:
            case TYcldouble:
                return (e->Eoper == OPmodass)
                        ? opmod_complex87(e, pretregs)
                        : opass_complex87(e, pretregs);

            default:
                assert(0);
        }
        // The reversed forms (FSUBR, FDIVR) are used for - and / because
        // E2 is in ST0 when E1 is brought in as the other operand.
        // op and opld are left unset for OPmodass, which uses neither.
        switch (e->Eoper)
        {   case OPpostinc:
            case OPaddass:      op = 0 << 3;    opld = 0xC1;    break;  // FADD
            case OPpostdec:
            case OPminass:      op = 5 << 3;    opld = 0xE1; /*0xE9;*/  break;  // FSUBR
            case OPmulass:      op = 1 << 3;    opld = 0xC9;    break;  // FMUL
            case OPdivass:      op = 7 << 3;    opld = 0xF1;    break;  // FDIVR
            case OPmodass:      break;
            default:            assert(0);
        }
        retregs = mST0;
        cr = codelem(e->E2,&retregs,FALSE);     // evaluate rvalue
        // Record E2 as living on the 8087 stack and, once the lvalue
        // address has been computed, make sure it is back in ST0
        note87(e->E2,0,0);
        cl = getlvalue(&cs,e->E1,e->Eoper==OPmodass?mAX:0);
        cl = cat(cl,makesure87(e->E2,0,0,0));
        cs.Iflags |= ADDFWAIT() ? CFwait : 0;
        // NOTE(review): opmod_complex87() and opass_complex87() test !I16
        // here - confirm that I64 is meant to be excluded.
        if (I32)
            cs.Iflags &= ~CFopsize;
        if (config.flags4 & CFG4fdivcall && e->Eoper == OPdivass)
        {
            // Divide by calling the CLIBfdiv87 library routine:
            // load E1, swap it below E2 and call
            c = push87();
            cs.Iop = op1;
            if (ty1 == TYldouble || ty1 == TYildouble)
                cs.Irm |= modregrm(0, 5, 0);    // FLD tbyte ptr ...
            c = gen(c,&cs);
            c = genf2(c,0xD9,0xC8 + 1);         // FXCH ST(1)
            c = cat(c,callclib(e,CLIBfdiv87,&retregs,0));
            pop87();
        }
        else if (e->Eoper == OPmodass)
        {
            /*
             *          fld     tbyte ptr y
             *          fld     tbyte ptr x             // ST = x, ST1 = y
             *  FM1:    // We don't use fprem1 because for some inexplicable
             *          // reason we get -5 when we do _modulo(15, 10)
             *          fprem                           // ST = ST % ST1
             *          fstsw   word ptr sw
             *          fwait
             *          mov     AH,byte ptr sw+1        // get msb of status word in AH
             *          sahf                            // transfer to flags
             *          jp      FM1                     // continue till ST < ST1
             *          fstp    ST(1)                   // leave remainder on stack
             */
            code *c1;

            c = push87();
            cs.Iop = op1;
            if (ty1 == TYldouble || ty1 == TYildouble)
                cs.Irm |= modregrm(0, 5, 0);    // FLD tbyte ptr ...
            c = gen(c,&cs);                     // FLD   e->E1

            // c1 is the FPREM loop; its JP branches back to its own first
            // instruction
            c1 = gen2(NULL, 0xD9, 0xF8);        // FPREM
            c1 = cg87_87topsw(c1);
            c1 = genjmp(c1, JP, FLcode, (block *)c1);   // JP FM1
            c1 = genf2(c1,0xDD,0xD8 + 1);       // FSTP ST(1)
            c = cat(c,c1);

            pop87();
        }
        else if (ty1 == TYldouble || ty1 == TYildouble)
        {
            // Load the 10 byte E1 onto the stack and use the popping
            // register form of the operation
            c = push87();
            cs.Iop = op1;
            cs.Irm |= modregrm(0, 5, 0);        // FLD tbyte ptr ...
            c = gen(c,&cs);                     // FLD   e->E1
            genf2(c,0xDE,opld);                 // FopP  ST(1)
            pop87();
        }
        else
        {   // float/double: operate on ST0 with E1 as memory operand.
            // Clearing the low bit of the load opcode gives the opcode
            // of the arithmetic instruction.
            cs.Iop = op1 & ~1;
            cs.Irm |= op;
            c = gen(CNIL,&cs);                  // Fop e->E1
        }
        if (*pretregs & mPSW)
            genftst(c,e,0);                     // FTST ST0
        /* if want result in registers  */
        if (*pretregs & (mST0 | ALLREGS | mBP))
        {
            if (ty1 == TYldouble || ty1 == TYildouble)
            {
                // The long double store always pops (op2 stays 7), so
                // duplicate ST0 first and let the store pop the copy
                c = cat(c,push87());
                c = genf2(c,0xD9,0xC0);         // FLD ST(0)
                pop87();
            }
            else
                op2 = 2;                        // FST e->E1
        }
        else
        {                                       // FSTP
            pop87();
        }
        // Store the result back into the lvalue
        cs.Iop = op1;
        NEWREG(cs.Irm,op2);                     // FSTx e->E1
        freenode(e->E1);
        gen(c,&cs);
        genfwait(c);
        return cat4(cr,cl,c,fixresult87(e,mST0 | mPSW,pretregs));
}

/***********************************
 * Perform %= where E1 is complex and E2 is real or imaginary.
 * Input:
 *      e               OPmodass node; E1 is the complex lvalue
 *      *pretregs       where the caller wants the result
 * Returns:
 *      the generated code
 */

code *opmod_complex87(elem *e,regm_t *pretregs)
{
    /* Shape of the generated code:
     *
     *          fld     E2
     *          fld     E1.re
     *  FM1:    fprem
     *          <status word to flags, see cg87_87topsw()>
     *          jp      FM1
     *          fxch    ST(1)
     *          fld     E1.im
     *  FM2:    fprem
     *          <status word to flags, see cg87_87topsw()>
     *          jp      FM2
     *          fstp    ST(1)
     *
     * followed by the stores of both halves back into E1.
     */
    code cs;                                    // instruction template addressing E1
    elem *elv = e->E1;
    elem *emod = e->E2;
    const tym_t tylv = tybasic(elv->Ety);
    const unsigned halfsz = tysize[tylv] / 2;   // size of one half of E1

    // Modulus into ST0, then the address of the lvalue
    regm_t retregs = mST0;
    code *cmodulus = codelem(emod,&retregs,FALSE);      // FLD E2
    note87(emod,0,0);
    code *clvalue = getlvalue(&cs,elv,0);
    clvalue = cat(clvalue,makesure87(emod,0,0,0));
    if (ADDFWAIT())
        cs.Iflags |= CFwait;
    if (!I16)
        cs.Iflags &= ~CFopsize;

    // Load the real half
    code *c = push87();
    if (tylv == TYcdouble)
        cs.Iop = ESC(MFdouble,1);
    else if (tylv == TYcfloat)
        cs.Iop = ESC(MFfloat,1);
    else if (tylv == TYcldouble)
    {
        cs.Iop = 0xDB;
        cs.Irm |= modregrm(0, 5, 0);
    }
    else
    {
        assert(0);
    }
    c = gen(c,&cs);                                     // FLD E1.re

    // First remainder loop; the JP targets the loop's own first instruction
    code *cloop = gen2(NULL, 0xD9, 0xF8);               // FPREM
    cloop = cg87_87topsw(cloop);
    cloop = genjmp(cloop, JP, FLcode, (block *)cloop);  // JP FM1
    cloop = genf2(cloop, 0xD9, 0xC8 + 1);               // FXCH ST(1)
    c = cat(c,cloop);

    // Load the imaginary half
    c = cat(c, push87());
    cs.IEVoffset1 += halfsz;
    gen(c, &cs);                                        // FLD E1.im

    // Second remainder loop, this one drops the modulus when done
    cloop = gen2(NULL, 0xD9, 0xF8);                     // FPREM
    cloop = cg87_87topsw(cloop);
    cloop = genjmp(cloop, JP, FLcode, (block *)cloop);  // JP FM2
    cloop = genf2(cloop, 0xDD, 0xD8 + 1);               // FSTP ST(1)
    c = cat(c,cloop);

    pop87();

    // Store both halves back, imaginary half first. Reg field 2 is OR'ed
    // in when the result is wanted afterwards, reg field 3 when it is not.
    const bool wantresult = (*pretregs & (mST01 | mPSW)) != 0;
    const unsigned storereg = wantresult ? 2 : 3;
    cs.Irm |= modregrm(0, storereg, 0);
    gen(c, &cs);                                        // FST/FSTP mreal.im
    cs.IEVoffset1 -= halfsz;
    gen(c, &cs);                                        // FST/FSTP mreal.re
    if (wantresult)
        retregs = mST01;
    else
    {
        pop87();
        pop87();
        retregs = 0;
    }

    freenode(elv);
    genfwait(c);
    return cat4(cmodulus,clvalue,c,fixresult_complex87(e,retregs,pretregs));
}

/**********************************
 * Perform +=, -=, *= and /= for the lvalue being complex.
 * Also reached for OPpostinc and OPpostdec, which are treated like
 * += and -=.
 * Input:
 *      e               the op= node; E1 is the complex lvalue
 *      *pretregs       where the caller wants the result
 * Returns:
 *      the generated code
 * Three strategies are used:
 *      - add/subtract, and multiply/divide by a non-complex E2, operate
 *        on the two halves separately (label L1)
 *      - multiply by a complex E2 calls CLIBcmul
 *      - divide by a complex E2 calls CLIBcdiv
 * L2 is the common store for the complex long double library call cases,
 * L3 is the common exit.
 */

code *opass_complex87(elem *e,regm_t *pretregs)
{
    regm_t retregs;
    regm_t idxregs;                     // index registers used to address E1
    code *cl,*cr,*c;                    // cl: lvalue address, cr: rvalue, c: operation and stores
    code cs;                            // instruction template addressing E1
    unsigned op;                        // reg field (already shifted) for Fop with memory operand
    unsigned op2;                       // mod/rm base for the FopP ST(i),ST form
    tym_t ty1;                          // basic type of the lvalue
    unsigned sz2;                       // size of one half of the lvalue

    ty1 = tybasic(e->E1->Ety);
    sz2 = tysize[ty1] / 2;
    switch (e->Eoper)
    {   case OPpostinc:
        case OPaddass:  op = 0 << 3;            // FADD
                        op2 = 0xC0;             // FADDP ST(i),ST
                        break;
        case OPpostdec:
        case OPminass:  op = 5 << 3;            // FSUBR
                        op2 = 0xE0;             // FSUBRP ST(i),ST
                        break;
        case OPmulass:  op = 1 << 3;            // FMUL
                        op2 = 0xC8;             // FMULP ST(i),ST
                        break;
        case OPdivass:  op = 7 << 3;            // FDIVR
                        op2 = 0xF0;             // FDIVRP ST(i),ST
                        break;
        default:        assert(0);
    }

    if (!tycomplex(e->E2->Ety) &&
        (e->Eoper == OPmulass || e->Eoper == OPdivass))
    {
        // Multiply/divide by a real or imaginary E2: duplicate E2 so that
        // there is one copy for each half of E1, then go through the same
        // per-half code as add/subtract.
        // Note that this jumps past the cs.Iflags adjustments below.
        retregs = mST0;
        cr = codelem(e->E2, &retregs, FALSE);
        note87(e->E2, 0, 0);
        cl = getlvalue(&cs, e->E1, 0);
        cl = cat(cl,makesure87(e->E2,0,0,0));
        cl = cat(cl,push87());
        cl = genf2(cl,0xD9,0xC0);               // FLD ST(0)
        goto L1;
    }
    else
    {
        cr = loadComplex(e->E2);
        cl = getlvalue(&cs,e->E1,0);
        cl = cat(cl,makesure87(e->E2,sz2,0,0));
        cl = cat(cl,makesure87(e->E2,0,1,0));
    }
    cs.Iflags |= ADDFWAIT() ? CFwait : 0;
    if (!I16)
        cs.Iflags &= ~CFopsize;

    switch (e->Eoper)
    {
        case OPpostinc:
        case OPaddass:
        case OPpostdec:
        case OPminass:
        L1:
            if (ty1 == TYcldouble)
            {
                // Load both 10 byte halves of E1 and combine each with the
                // matching value two stack slots below it
                c = push87();
                c = cat(c, push87());
                cs.Iop = 0xDB;
                cs.Irm |= modregrm(0, 5, 0);    // FLD tbyte ptr ...
                c = gen(c,&cs);                 // FLD e->E1.re
                cs.IEVoffset1 += sz2;
                gen(c,&cs);                     // FLD e->E1.im
                genf2(c, 0xDE, op2 + 2);        // FopP ST(2),ST
                genf2(c, 0xDE, op2 + 2);        // FopP ST(2),ST
                pop87();
                pop87();
                if (tyimaginary(e->E2->Ety))
                {
                    // Extra sign change and swap of the halves when E2 is
                    // imaginary
                    if (e->Eoper == OPmulass)
                    {
                        genf2(c, 0xD9, 0xE0);   // FCHS
                        genf2(c, 0xD9, 0xC8+1); // FXCH ST(1)
                    }
                    else if (e->Eoper == OPdivass)
                    {
                        genf2(c, 0xD9, 0xC8+1); // FXCH ST(1)
                        genf2(c, 0xD9, 0xE0);   // FCHS
                    }
                }
            L2:
                // Store the complex long double result. The store always
                // pops, so duplicate the pair first if it is still wanted.
                if (*pretregs & (mST01 | mPSW))
                {
                    c = cat(c,push87());
                    c = cat(c,push87());
                    c = genf2(c,0xD9,0xC1);     // FLD ST(1)
                    c = genf2(c,0xD9,0xC1);     // FLD ST(1)
                    retregs = mST01;
                }
                else
                    retregs = 0;
                cs.Iop = 0xDB;
                cs.Irm |= modregrm(0,7,0);
                gen(c,&cs);                     // FSTP e->E1.im
                cs.IEVoffset1 -= sz2;
                gen(c,&cs);                     // FSTP e->E1.re
                pop87();
                pop87();

            }
            else
            {   // float/double halves: operate with E1 as memory operand.
                // Precompute the mod/rm and opcode bytes needed below.
                unsigned char rmop = cs.Irm | op;                       // Fop mem
                unsigned char rmfst = cs.Irm | modregrm(0,2,0);         // FST mem
                unsigned char rmfstp = cs.Irm | modregrm(0,3,0);        // FSTP mem
                unsigned char iopfst = (ty1 == TYcfloat) ? 0xD9 : 0xDD; // store opcode
                unsigned char iop = (ty1 == TYcfloat) ? 0xD8 : 0xDC;    // arithmetic opcode

                cs.Iop = iop;
                cs.Irm = rmop;
                cs.IEVoffset1 += sz2;
                c = gen(NULL, &cs);             // Fop mreal.im
                if (tyimaginary(e->E2->Ety) && (e->Eoper == OPmulass || e->Eoper == OPdivass))
                {
                    // Multiply/divide by an imaginary E2: the halves of the
                    // result are left on the stack in swapped order, so the
                    // value in ST0 is stored to .im only after the Fop on .re
                    if (e->Eoper == OPmulass)
                        genf2(c, 0xD9, 0xE0);           // FCHS
                    genf2(c,0xD9,0xC8 + 1);             // FXCH ST(1)
                    cs.IEVoffset1 -= sz2;
                    gen(c, &cs);                        // Fop mreal.re
                    if (e->Eoper == OPdivass)
                        genf2(c, 0xD9, 0xE0);           // FCHS
                    if (*pretregs & (mST01 | mPSW))
                    {
                        cs.Iop = iopfst;
                        cs.Irm = rmfst;
                        cs.IEVoffset1 += sz2;
                        gen(c, &cs);                    // FST mreal.im
                        genf2(c,0xD9,0xC8 + 1);         // FXCH ST(1)
                        cs.IEVoffset1 -= sz2;
                        gen(c, &cs);                    // FST mreal.re
                        genf2(c,0xD9,0xC8 + 1);         // FXCH ST(1)
                        retregs = mST01;
                    }
                    else
                    {
                        cs.Iop = iopfst;
                        cs.Irm = rmfstp;
                        cs.IEVoffset1 += sz2;
                        gen(c, &cs);                    // FSTP mreal.im
                        pop87();
                        cs.IEVoffset1 -= sz2;
                        gen(c, &cs);                    // FSTP mreal.re
                        pop87();
                        retregs = 0;
                    }
                    goto L3;
                }

                // Remaining cases: finish the .im half, then do the .re half
                if (*pretregs & (mST01 | mPSW))
                {
                    cs.Iop = iopfst;
                    cs.Irm = rmfst;
                    gen(c, &cs);                // FST mreal.im
                    genf2(c,0xD9,0xC8 + 1);     // FXCH ST(1)
                    cs.Iop = iop;
                    cs.Irm = rmop;
                    cs.IEVoffset1 -= sz2;
                    gen(c, &cs);                // Fop mreal.re
                    cs.Iop = iopfst;
                    cs.Irm = rmfst;
                    gen(c, &cs);                // FST mreal.re
                    genf2(c,0xD9,0xC8 + 1);     // FXCH ST(1)
                    retregs = mST01;
                }
                else
                {
                    cs.Iop = iopfst;
                    cs.Irm = rmfstp;
                    gen(c, &cs);                // FSTP mreal.im
                    pop87();
                    cs.Iop = iop;
                    cs.Irm = rmop;
                    cs.IEVoffset1 -= sz2;
                    gen(c, &cs);                // Fop mreal.re
                    cs.Iop = iopfst;
                    cs.Irm = rmfstp;
                    gen(c, &cs);                // FSTP mreal.re
                    pop87();
                    retregs = 0;
                }
            }
        L3:
            // Common exit
            freenode(e->E1);
            genfwait(c);
            return cat4(cr,cl,c,fixresult_complex87(e,retregs,pretregs));

        case OPmulass:
            // Complex * complex: load E1 on top of E2 and call CLIBcmul
            c = push87();
            c = cat(c, push87());
            if (ty1 == TYcldouble)
            {
                cs.Iop = 0xDB;
                cs.Irm |= modregrm(0, 5, 0);    // FLD tbyte ptr ...
                c = gen(c,&cs);                 // FLD e->E1.re
                cs.IEVoffset1 += sz2;
                gen(c,&cs);                     // FLD e->E1.im
                retregs = mST01;
                c = cat(c,callclib(e, CLIBcmul, &retregs, 0));
                goto L2;
            }
            else
            {
                cs.Iop = (ty1 == TYcfloat) ? 0xD9 : 0xDD;
                cs.Irm |= modregrm(0, 0, 0);    // reg field 0: FLD float/double
                c = gen(c,&cs);                 // FLD e->E1.re
                cs.IEVoffset1 += sz2;
                gen(c,&cs);                     // FLD e->E1.im
                retregs = mST01;
                c = cat(c,callclib(e, CLIBcmul, &retregs, 0));
                if (*pretregs & (mST01 | mPSW))
                {
                    cs.Irm |= modregrm(0, 2, 0);
                    gen(c, &cs);                // FST mreal.im
                    cs.IEVoffset1 -= sz2;
                    gen(c, &cs);                // FST mreal.re
                    retregs = mST01;
                }
                else
                {
                    cs.Irm |= modregrm(0, 3, 0);
                    gen(c, &cs);                // FSTP mreal.im
                    cs.IEVoffset1 -= sz2;
                    gen(c, &cs);                // FSTP mreal.re
                    pop87();
                    pop87();
                    retregs = 0;
                }
                goto L3;
            }

        case OPdivass:
            // Complex / complex: load E1, exchanging after each load so
            // that E1 ends up below E2, and call CLIBcdiv. idxregs is
            // passed to callclib() since cs is used again after the call.
            c = push87();
            c = cat(c, push87());
            idxregs = idxregm(&cs);             // mask of index regs used
            if (ty1 == TYcldouble)
            {
                cs.Iop = 0xDB;
                cs.Irm |= modregrm(0, 5, 0);    // FLD tbyte ptr ...
                c = gen(c,&cs);                 // FLD e->E1.re
                genf2(c,0xD9,0xC8 + 2);         // FXCH ST(2)
                cs.IEVoffset1 += sz2;
                gen(c,&cs);                     // FLD e->E1.im
                genf2(c,0xD9,0xC8 + 2);         // FXCH ST(2)
                retregs = mST01;
                c = cat(c,callclib(e, CLIBcdiv, &retregs, idxregs));
                goto L2;
            }
            else
            {
                cs.Iop = (ty1 == TYcfloat) ? 0xD9 : 0xDD;
                cs.Irm |= modregrm(0, 0, 0);    // reg field 0: FLD float/double
                c = gen(c,&cs);                 // FLD e->E1.re
                genf2(c,0xD9,0xC8 + 2);         // FXCH ST(2)
                cs.IEVoffset1 += sz2;
                gen(c,&cs);                     // FLD e->E1.im
                genf2(c,0xD9,0xC8 + 2);         // FXCH ST(2)
                retregs = mST01;
                c = cat(c,callclib(e, CLIBcdiv, &retregs, idxregs));
                if (*pretregs & (mST01 | mPSW))
                {
                    cs.Irm |= modregrm(0, 2, 0);
                    gen(c, &cs);                // FST mreal.im
                    cs.IEVoffset1 -= sz2;
                    gen(c, &cs);                // FST mreal.re
                    retregs = mST01;
                }
                else
                {
                    cs.Irm |= modregrm(0, 3, 0);
                    gen(c, &cs);                // FSTP mreal.im
                    cs.IEVoffset1 -= sz2;
                    gen(c, &cs);                // FSTP mreal.re
                    pop87();
                    pop87();
                    retregs = 0;
                }
                goto L3;
            }

        default:
            assert(0);
    }
    return NULL;
}

/**************************
 * OPnegass
 * Negate a floating point lvalue in place by flipping its sign bit in
 * memory with an XOR on the byte that holds it. For a complex lvalue the
 * sign bits of both halves are flipped.
 * Input:
 *      e               OPnegass node; E1 is the lvalue
 *      *pretregs       where the caller wants the result; if nonzero the
 *                      negated value is loaded into ST0 afterwards
 *                      (not supported for complex lvalues)
 * Returns:
 *      the generated code
 */

code *cdnegass87(elem *e,regm_t *pretregs)
{   regm_t retregs;
    tym_t tyml;
    unsigned op;
    code *cl,*cr,*c,cs;
    elem *e1;
    int sz;
    int signoff;                        // offset from EA of the byte holding the sign bit

    //printf("cdnegass87(e = %p, *pretregs = x%x)\n", e, *pretregs);
    e1 = e->E1;
    tyml = tybasic(e1->Ety);            // type of lvalue
    sz = tysize[tyml];

    cl = getlvalue(&cs,e1,0);
    cr = modEA(&cs);
    cs.Irm |= modregrm(0,6,0);
    cs.Iop = 0x80;
    cs.Irex = 0;
    // The sign bit is in the last byte of the value. When long doubles are
    // padded beyond 10 bytes that is not the last byte of the storage.
#if LNGDBLSIZE > 10
    if (tyml == TYldouble || tyml == TYildouble)
        signoff = 10 - 1;
    else if (tyml == TYcldouble)
        signoff = tysize[TYldouble] + 10 - 1;
    else
#endif
        signoff = sz - 1;
    cs.IEVoffset1 += signoff;
    cs.IFL2 = FLconst;
    cs.IEV2.Vuns = 0x80;
    c = gen(NULL,&cs);                  // XOR signoff[EA],0x80
    if (tycomplex(tyml))
    {
        // The first XOR did the second half, step back to the first half
        cs.IEVoffset1 -= sz / 2;
        gen(c,&cs);                     // XOR (signoff - sz/2)[EA],0x80
    }
    c = cat3(cl,cr,c);

    if (*pretregs)
    {
        switch (tyml)
        {
            case TYifloat:
            case TYfloat:               cs.Iop = 0xD9;  op = 0; break;
            case TYidouble:
            case TYdouble:
            case TYdouble_alias:        cs.Iop = 0xDD;  op = 0; break;
            case TYildouble:
            case TYldouble:             cs.Iop = 0xDB;  op = 5; break;
            default:
                assert(0);
        }
        NEWREG(cs.Irm,op);
        // Undo exactly the displacement applied above so the load starts at
        // the beginning of the lvalue. Subtracting sz - 1 here would
        // overshoot for long doubles that are padded beyond 10 bytes.
        cs.IEVoffset1 -= signoff;
        c = cat(c, push87());
        c = gen(c,&cs);                 // FLD EA
        retregs = mST0;
    }
    else
        retregs = 0;

    freenode(e1);
    return cat(c,fixresult87(e,retregs,pretregs));
}

/************************
 * Take care of OPpostinc and OPpostdec.
 *
 * Loads the lvalue e->E1 onto the 8087 stack, adds (OPpostinc) or
 * subtracts (OPpostdec) the rvalue e->E2, and stores the updated value
 * back into e->E1. When a register result is requested the loaded value
 * is duplicated first, so the copy left on the stack is the value from
 * before the update.
 * For complex lvalues both halves are loaded but only the real part is
 * updated and stored back.
 *
 * Input:
 *      e               the OPpostinc/OPpostdec node
 *      pretregs        where the result is wanted; must be non-zero
 * Returns:
 *      generated code list
 */

code *post87(elem *e,regm_t *pretregs)
{
        regm_t retregs;
        code *cl,*cr,*c;
        code cs;
        unsigned op;
        unsigned op1;           // opcode byte used for both the FLD and the FSTP of e->E1
        unsigned reg;           // reg field of the mod/rm byte: first for FLD, later for FSTP
        tym_t ty1;

        //printf("post87()\n");
        assert(*pretregs);
        cl = getlvalue(&cs,e->E1,0);
        cs.Iflags |= ADDFWAIT() ? CFwait : 0;
        if (!I16)
            cs.Iflags &= ~CFopsize;
        ty1 = tybasic(e->E1->Ety);
        // Pick the load opcode by operand size; complex types use the
        // opcode of their component type
        switch (ty1)
        {   case TYdouble_alias:
            case TYidouble:
            case TYdouble:
            case TYcdouble:     op1 = ESC(MFdouble,1);  reg = 0;        break;
            case TYifloat:
            case TYfloat:
            case TYcfloat:      op1 = ESC(MFfloat,1);   reg = 0;        break;
            case TYildouble:
            case TYldouble:
            case TYcldouble:    op1 = 0xDB;             reg = 5;        break;
            default:
                assert(0);
        }
        NEWREG(cs.Irm, reg);
        // From here on reg is the reg field for the matching FSTP:
        // 7 for the tbyte form (load was /5), 3 otherwise (load was /0)
        if (reg == 5)
            reg = 7;
        else
            reg = 3;
        cs.Iop = op1;
        cl = cat(cl,push87());
        cl = gen(cl,&cs);               // FLD e->E1
        if (tycomplex(ty1))
        {   unsigned sz = tysize[ty1] / 2;      // size of one component

            cl = cat(cl,push87());
            cs.IEVoffset1 += sz;                // address the second component
            cl = gen(cl,&cs);           // FLD e->E1
            retregs = mST0;             // note kludge to only load real part
            cr = codelem(e->E2,&retregs,FALSE); // load rvalue
            // Stack is now: ST0 = rvalue, ST1 = second component,
            // ST2 = first component of e->E1
            c = genf2(NULL,0xD8,        // FADD/FSUBR ST,ST2
                (e->Eoper == OPpostinc) ? 0xC0 + 2 : 0xE8 + 2);
            NEWREG(cs.Irm,reg);
            pop87();
            cs.IEVoffset1 -= sz;                // back to the first component
            gen(c,&cs);                 // FSTP e->E1
            genfwait(c);
            freenode(e->E1);
            // The two components loaded above are still on the stack
            return cat4(cl, cr, c, fixresult_complex87(e, mST01, pretregs));
        }

        if (*pretregs & (mST0 | ALLREGS | mBP))
        {   // Want the result in a register
            cl = cat(cl,push87());
            genf2(cl,0xD9,0xC0);        // FLD ST0
        }
        if (*pretregs & mPSW)           /* if result in flags           */
            genftst(cl,e,0);            // FTST ST0
        retregs = mST0;
        cr = codelem(e->E2,&retregs,FALSE);     /* load rvalue          */
        pop87();                        // the arithmetic below pops the rvalue
        op = (e->Eoper == OPpostinc) ? modregrm(3,0,1) : modregrm(3,5,1);
        c = genf2(NULL,0xDE,op);        // FADDP/FSUBRP ST1
        NEWREG(cs.Irm,reg);
        pop87();                        // the store below pops the updated value
        gen(c,&cs);                     /* FSTP e->E1                   */
        genfwait(c);
        freenode(e->E1);
        return cat4(cl,cr,c,fixresult87(e,mPSW | mST0,pretregs));
}

/************************
 * Do the following opcodes:
 *      OPd_s16
 *      OPd_s32
 *      OPd_u16
 *      OPd_s64
 *
 * Converts the floating point value of e->E1 to an integer. For 16 bit
 * code generation the work is handed to a runtime library routine.
 * Otherwise the value is stored with FISTP into space reserved on the
 * CPU stack while the 8087 control word is temporarily switched to
 * CW_roundto0, and the integer is then popped into the result
 * register(s).
 *
 * Input:
 *      e               the conversion node
 *      pretregs        where the result is wanted; must be non-zero
 * Returns:
 *      generated code list
 */

code *cnvt87(elem *e,regm_t *pretregs)
{
        regm_t retregs;
        code *c1,*c2;
        unsigned mf,rf,reg;     // mf/rf: opcode byte and reg field of the FISTP
        tym_t tym;
        int clib;               // runtime library routine used by the 16 bit path
        int sz;                 // size of the result type
        int szoff;              // bytes written by FISTP == stack offset of the saved control word

        //printf("cnvt87(e = %p, *pretregs = x%x)\n", e, *pretregs);
        assert(*pretregs);
        tym = e->Ety;
        sz = tysize(tym);
        szoff = sz;
        unsigned grex = I64 ? REX_W << 16 : 0;

        switch (e->Eoper)
        {   case OPd_s16:
                clib = CLIBdblint87;
                mf = ESC(MFword,1);
                rf = 3;
                break;

            case OPd_u16:
                // Converted through the same 32 bit store as OPd_s32,
                // so 4 bytes are written rather than sz
                szoff = 4;
                // fall through
            case OPd_s32:
                clib = CLIBdbllng87;
                mf = ESC(MFlong,1);
                rf = 3;
                break;

            case OPd_s64:
                clib = CLIBdblllng;
                mf = 0xDF;
                rf = 7;
                break;

            default:
                assert(0);
        }

        if (I16)                       // C may change the default control word
        {
            if (clib == CLIBdblllng)
            {   retregs = I32 ? DOUBLEREGS_32 : DOUBLEREGS_16;
                c1 = codelem(e->E1,&retregs,FALSE);
                c2 = callclib(e,clib,pretregs,0);
            }
            else
            {   retregs = mST0; //I32 ? DOUBLEREGS_32 : DOUBLEREGS_16;
                c1 = codelem(e->E1,&retregs,FALSE);
                c2 = callclib(e,clib,pretregs,0);
                pop87();
            }
        }
        else if (1)
        {   //  Generate:
            //  sub     ESP,12
            //  fstcw   8[ESP]
            //  fldcw   roundto0
            //  fistp   long64 ptr [ESP]
            //  fldcw   8[ESP]
            //  pop     lsw
            //  pop     msw
            //  add     ESP,4

            // Stack space needed: szoff bytes for the integer, 2 for the
            // saved control word, plus 2 more under PIC for a copy of
            // CW_roundto0; rounded up to a multiple of REGSIZE
            unsigned szpush = szoff + 2;
            if (config.flags3 & CFG3pic)
                szpush += 2;
            szpush = (szpush + REGSIZE - 1) & ~(REGSIZE - 1);

            retregs = mST0;
            c1 = codelem(e->E1,&retregs,FALSE);

            if (szpush == REGSIZE)
                c1 = gen1(c1,0x50 + AX);                // PUSH EAX
            else
                c1 = genc2(c1,0x81,grex | modregrm(3,5,SP), szpush);   // SUB ESP,12
            c1 = genfwait(c1);
            genc1(c1,0xD9,modregrm(2,7,4) + 256*modregrm(0,4,SP),FLconst,szoff); // FSTCW szoff[ESP]

            c1 = genfwait(c1);

            if (config.flags3 & CFG3pic)
            {
                // Build the round-to-0 control word on the stack and load
                // it from there instead of from a read-only symbol
                genc(c1,0xC7,modregrm(2,0,4) + 256*modregrm(0,4,SP),FLconst,szoff+2,FLconst,CW_roundto0); // MOV szoff+2[ESP], CW_roundto0
                code_orflag(c1, CFopsize);
                genc1(c1,0xD9,modregrm(2,5,4) + 256*modregrm(0,4,SP),FLconst,szoff+2); // FLDCW szoff+2[ESP]
            }
            else
                c1 = genrnd(c1, CW_roundto0);   // FLDCW roundto0

            pop87();                            // the FISTP below pops ST0

            c1 = genfwait(c1);
            gen2sib(c1,mf,grex | modregrm(0,rf,4),modregrm(0,4,SP));                   // FISTP [ESP]

            retregs = *pretregs & (ALLREGS | mBP);
            if (!retregs)
                    retregs = ALLREGS;
            c2 = allocreg(&retregs,&reg,tym);

            c2 = genfwait(c2);                                                          // FWAIT
            c2 = genc1(c2,0xD9,grex | modregrm(2,5,4) + 256*modregrm(0,4,SP),FLconst,szoff);   // FLDCW szoff[ESP]

            // Pop the integer into the result register(s); szpush is
            // reduced to whatever still has to be released afterwards
            if (szoff > REGSIZE)
            {   szpush -= REGSIZE;
                c2 = genpop(c2,findreglsw(retregs));       // POP lsw
            }
            szpush -= REGSIZE;
            c2 = genpop(c2,reg);                           // POP reg

            if (szpush)
                genc2(c2,0x81,grex | modregrm(3,0,SP), szpush);        // ADD ESP,4
            c2 = cat(c2,fixresult(e,retregs,pretregs));
        }
        else
        {
            // Never taken because of the `else if (1)` above.
            // This is incorrect. For -inf and nan, the 8087 returns the largest
            // negative int (0x80000....). For -inf, 0x7FFFF... should be returned,
            // and for nan, 0 should be returned.
            retregs = mST0;
            c1 = codelem(e->E1,&retregs,FALSE);

            c1 = genfwait(c1);
            c1 = genrnd(c1, CW_roundto0);       // FLDCW roundto0

            pop87();
            c1 = genfltreg(c1,mf,rf,0);         // FISTP floatreg
            retregs = *pretregs & (ALLREGS | mBP);
            if (!retregs)
                    retregs = ALLREGS;
            c2 = allocreg(&retregs,&reg,tym);

            c2 = genfwait(c2);

            if (sz > REGSIZE)
            {   c2 = genfltreg(c2,0x8B,reg,REGSIZE);    // MOV reg,floatreg + REGSIZE
                                                        // MOV lsreg,floatreg
                genfltreg(c2,0x8B,findreglsw(retregs),0);
            }
            else
                c2 = genfltreg(c2,0x8B,reg,0);  // MOV reg,floatreg
            c2 = genrnd(c2, CW_roundtonearest); // FLDCW roundtonearest
            c2 = cat(c2,fixresult(e,retregs,pretregs));
        }
        return cat(c1,c2);
}

/************************
 * Do OPrndtol.
 *
 * Evaluates e->E1 into ST0, stores it as an integer of the result
 * type's size into floatreg with FISTP, and moves that integer into
 * the result register(s).
 *
 * Input:
 *      e               the OPrndtol node
 *      pretregs        where the result is wanted; if 0, e->E1 is
 *                      evaluated only for its side effects
 * Returns:
 *      generated code list
 */

code *cdrndtol(elem *e,regm_t *pretregs)
{
        // Result not wanted: nothing to convert
        if (*pretregs == 0)
            return codelem(e->E1,pretregs,FALSE);

        tym_t tym = e->Ety;
        regm_t retregs = mST0;
        code *cfp = codelem(e->E1,&retregs,FALSE);

        // Select the FISTP encoding that matches the integer size
        unsigned char fistop,fistreg;
        unsigned sz = tysize(tym);
        if (sz == 2)
        {   fistop = 0xDF;
            fistreg = 3;
        }
        else if (sz == 4)
        {   fistop = 0xDB;
            fistreg = 3;
        }
        else if (sz == 8)
        {   fistop = 0xDF;
            fistreg = 7;
        }
        else
            assert(0);

        pop87();                                        // FISTP pops ST0
        cfp = genfltreg(cfp,fistop,fistreg,0);          // FISTP floatreg

        retregs = *pretregs & (ALLREGS | mBP);
        if (!retregs)
                retregs = ALLREGS;

        unsigned reg;
        code *cint = allocreg(&retregs,&reg,tym);
        cint = genfwait(cint);                          // FWAIT
        if (tysize(tym) > REGSIZE)
        {   // Result spans two registers
            cint = genfltreg(cint,0x8B,reg,REGSIZE);    // MOV reg,floatreg + REGSIZE
            genfltreg(cint,0x8B,findreglsw(retregs),0); // MOV lsreg,floatreg
        }
        else
        {
            cint = genfltreg(cint,0x8B,reg,0);          // MOV reg,floatreg
            if (tysize(tym) == 8 && I64)
                code_orrex(cint, REX_W);
        }
        cint = cat(cint,fixresult(e,retregs,pretregs));

        return cat(cfp,cint);
}

/*************************
 * Do OPscale, OPyl2x, OPyl2xp1.
 *
 * Both operands are evaluated onto the 8087 stack, the matching
 * two-operand instruction is emitted, and the single remaining value
 * is returned in ST0.
 *
 * Input:
 *      e               the operator node
 *      pretregs        where the result is wanted; must be non-zero
 * Returns:
 *      generated code list
 */

code *cdscale(elem *e,regm_t *pretregs)
{
    assert(*pretregs != 0);

    regm_t retregs = mST0;
    code *cleft = codelem(e->E1,&retregs,FALSE);
    note87(e->E1,0,0);
    code *cright = codelem(e->E2,&retregs,FALSE);
    // now have x,y on stack; need y,x
    cright = cat(cright,makesure87(e->E1,0,1,0));

    if (e->Eoper == OPscale)
    {
        cright = genf2(cright,0xD9,0xFD);       // FSCALE
        cright = genf2(cright,0xDD,0xD8 + 1);   // FSTP ST(1)
    }
    else if (e->Eoper == OPyl2x)
        cright = genf2(cright,0xD9,0xF1);       // FYL2X
    else if (e->Eoper == OPyl2xp1)
        cright = genf2(cright,0xD9,0xF9);       // FYL2XP1

    pop87();                                    // two operands became one result
    code *cfix = fixresult87(e,mST0,pretregs);
    return cat3(cleft,cright,cfix);
}


/**********************************
 * Unary -, absolute value, square root, sine, cosine, round to integer.
 *
 * Each operator maps to a single "D9 xx" instruction that works on ST0
 * in place.
 *
 * Input:
 *      e               the unary operator node
 *      pretregs        where the result is wanted; must be non-zero
 * Returns:
 *      generated code list
 */

code *neg87(elem *e,regm_t *pretregs)
{
        assert(*pretregs);

        int opbyte;                     // second byte of the D9 xx instruction
        switch (e->Eoper)
        {
            case OPneg:
                opbyte = 0xE0;          // FCHS
                break;
            case OPabs:
                opbyte = 0xE1;          // FABS
                break;
            case OPsqrt:
                opbyte = 0xFA;          // FSQRT
                break;
            case OPsin:
                opbyte = 0xFE;          // FSIN
                break;
            case OPcos:
                opbyte = 0xFF;          // FCOS
                break;
            case OPrint:
                opbyte = 0xFC;          // FRNDINT
                break;
            default:
                assert(0);
        }

        regm_t operandregs = mST0;
        code *cop = codelem(e->E1,&operandregs,FALSE);
        cop = genf2(cop,0xD9,opbyte);
        code *cfix = fixresult87(e,mST0,pretregs);
        return cat(cop,cfix);
}

/**********************************
 * Unary - for complex operands
 *
 * Evaluates e->E1 into ST01 and changes the sign of both halves.
 *
 * Input:
 *      e               the OPneg node
 *      pretregs        where the result is wanted
 * Returns:
 *      generated code list
 */

code *neg_complex87(elem *e,regm_t *pretregs)
{
    assert(e->Eoper == OPneg);

    regm_t operandregs = mST01;
    code *cneg = codelem(e->E1,&operandregs,FALSE);

    // Negate ST0 and swap, twice: each half gets negated once and the
    // second swap restores the original order
    for (int half = 0; half < 2; half++)
    {
        cneg = genf2(cneg,0xD9,0xE0);           // FCHS
        cneg = genf2(cneg,0xD9,0xC8 + 1);       // FXCH ST(1)
    }

    code *cfix = fixresult_complex87(e,mST01,pretregs);
    return cat(cneg,cfix);
}

/*********************************
 * Load a float, double or long double (real or imaginary) from the
 * memory operand described by e onto the 8087 stack.
 *
 * Input:
 *      e               the node to load through
 *      pretregs        where the result is wanted; if 0 only the
 *                      addressing code is generated
 * Returns:
 *      generated code list
 */

code *cdind87(elem *e,regm_t *pretregs)
{
    code cs;

    //printf("cdind87(e = %p, *pretregs = x%x)\n",e,*pretregs);

    code *c = getlvalue(&cs,e,0);       // get addressing mode
    if (!*pretregs)
        return c;

    // Choose the FLD form from the operand type
    tym_t ty = tybasic(e->Ety);
    if (ty == TYfloat || ty == TYifloat)
        cs.Iop = 0xD9;
    else if (ty == TYidouble || ty == TYdouble || ty == TYdouble_alias)
        cs.Iop = 0xDD;
    else if (ty == TYildouble || ty == TYldouble)
    {
        cs.Iop = 0xDB;
        cs.Irm |= modregrm(0,5,0);
    }
    else
        assert(0);

    c = cat(c,push87());
    c = gen(c,&cs);                     // FLD EA
    return cat(c,fixresult87(e,mST0,pretregs));
}

/************************************
 * Reset statics for another .obj file.
 *
 * Clears the cached control word symbols so that genrnd() emits them
 * again the next time they are needed.
 */

void cg87_reset()
{
    oldd.round = 0;
    oldd.roundto0 = NULL;
    oldd.roundtonearest = NULL;
}


/*****************************************
 * Generate code to load the 8087 control word with cw.
 *
 * Under PIC the value is written to floatreg and loaded from there.
 * Otherwise it is loaded from a read-only symbol; the symbols for
 * CW_roundto0 and CW_roundtonearest are emitted on first use and cached
 * in oldd.
 *
 * Input:
 *      c       code list to append to
 *      cw      control word value; anything other than CW_roundto0
 *              selects the round-to-nearest symbol in the non-PIC case
 * Returns:
 *      c with the FLDCW sequence appended
 */

STATIC code *genrnd(code *c, short cw)
{
    if (config.flags3 & CFG3pic)
    {
        code *cpic = genfltreg(NULL, 0xC7, 0, 0);       // MOV floatreg, cw
        cpic->IFL2 = FLconst;
        cpic->IEV2.Vuns = cw;

        cpic = genfltreg(cpic, 0xD9, 5, 0);             // FLDCW floatreg
        return cat(c, cpic);
    }

    if (!oldd.round)                    // if not initialized
    {
        oldd.round = 1;

        short cwval = CW_roundto0;                      // round to 0
        oldd.roundto0 = out_readonly_sym(TYshort,&cwval,2);
        cwval = CW_roundtonearest;                      // round to nearest
        oldd.roundtonearest = out_readonly_sym(TYshort,&cwval,2);
    }

    symbol *cwsym;
    if (cw == CW_roundto0)
        cwsym = oldd.roundto0;
    else
        cwsym = oldd.roundtonearest;

    // FLDCW cwsym
    code cs;
    cs.Iop = 0xD9;
    cs.Irm = modregrm(0,5,BPRM);
    cs.Irex = 0;
    cs.Iflags = CFoff;
    cs.IFL1 = cwsym->Sfl;
    cs.IEVsym1 = cwsym;
    cs.IEVoffset1 = 0;
    return gen(c,&cs);
}

/************************* Complex Numbers *********************/

/***************************
 * Set the PSW based on the state of ST01.
 *
 * Both halves of the complex value are compared against 0. The flags
 * of the second comparison are only transferred when the first one
 * came out equal and ordered, so the final flags say "equal" only if
 * both halves equal 0.
 * Input:
 *      c       code list to append to
 *      e       not referenced by this function
 *      pop     if stack should be popped after test
 * Returns:
 *      start of code appended to c.
 */

STATIC code * genctst(code *c,elem *e,int pop)
#if __DMC__
__in
{
    assert(pop == 0 || pop == 1);
}
__body
#endif
{
    // Generate:
    //  if (pop)
    //          FLDZ
    //          FUCOMPP
    //          FSTSW   AX
    //          SAHF
    //          FLDZ
    //          FUCOMPP
    //          JNE     L1
    //          JP      L1              // if NAN
    //          FSTSW   AX
    //          SAHF
    //      L1:
    //  else
    //          FLDZ
    //          FUCOM
    //          FSTSW   AX
    //          SAHF
    //          FUCOMP  ST(2)
    //          JNE     L1
    //          JP      L1              // if NAN
    //          FSTSW   AX
    //          SAHF
    //      L1:
    // FUCOMP doesn't raise exceptions on QNANs, unlike FTST

    code *cnop;

    cnop = gennop(CNIL);                // jump target L1, appended at the end
    c = cat(c,push87());
    c = gen2(c,0xD9,0xEE);              // FLDZ
    if (pop)
    {
        gen2(c,0xDA,0xE9);              // FUCOMPP
        pop87();                        // the 0 that was just pushed
        pop87();                        // and the first half of the operand
        cg87_87topsw(c);                // put 8087 flags in CPU flags
        // This second 0 is not recorded with push87(); the FUCOMPP
        // removes it again together with the remaining half, which is
        // accounted for by the single pop87() below
        gen2(c,0xD9,0xEE);              // FLDZ
        gen2(c,0xDA,0xE9);              // FUCOMPP
        pop87();
        genjmp(c,JNE,FLcode,(block *) cnop); // JNE     L1
        genjmp(c,JP, FLcode,(block *) cnop); // JP      L1
        cg87_87topsw(c);                // put 8087 flags in CPU flags
    }
    else
    {
        // Operand stays on the stack: compare 0 against each half in
        // turn, then pop only the 0
        gen2(c,0xDD,0xE1);              // FUCOM
        cg87_87topsw(c);                // put 8087 flags in CPU flags
        gen2(c,0xDD,0xEA);              // FUCOMP ST(2)
        pop87();
        genjmp(c,JNE,FLcode,(block *) cnop); // JNE     L1
        genjmp(c,JP, FLcode,(block *) cnop); // JP      L1
        cg87_87topsw(c);                // put 8087 flags in CPU flags
    }
    return cat(c, cnop);
}

/******************************
 * Given the result of an expression is in retregs,
 * generate necessary code to return result in *pretregs.
 *
 * Complex counterpart of fixresult87(). Handles moving a complex value
 * between ST01 and EDX:EAX (TYcfloat only) or XMM1:XMM0 (TYcfloat and
 * TYcdouble), discarding an unused ST01 result, and setting the flags.
 *
 * Input:
 *      e               expression whose result is being fixed up
 *      retregs         where the result currently is
 *      pretregs        where the result is wanted
 * Returns:
 *      generated code list
 */


code *fixresult_complex87(elem *e,regm_t retregs,regm_t *pretregs)
{
    tym_t tym;
    code *c1,*c2;
    unsigned sz;

#if 0
    printf("fixresult_complex87(e = %p, retregs = %s, *pretregs = %s)\n",
        e,regm_str(retregs),regm_str(*pretregs));
#endif
    assert(!*pretregs || retregs);
    c1 = CNIL;
    c2 = CNIL;
    tym = tybasic(e->Ety);
    sz = tysize[tym];

    if (*pretregs == 0 && retregs == mST01)
    {
        // Result is not wanted: discard both halves
        c1 = genf2(c1,0xDD,modregrm(3,3,0));    // FPOP
        pop87();
        c1 = genf2(c1,0xDD,modregrm(3,3,0));    // FPOP
        pop87();
    }
    else if (tym == TYcfloat && *pretregs & (mAX|mDX) && retregs & mST01)
    {
        // ST01 => EDX:EAX, going through floatreg one half at a time;
        // ST0 ends up in EDX, ST1 in EAX
        if (*pretregs & mPSW && !(retregs & mPSW))
            c1 = genctst(c1,e,0);               // FTST
        pop87();
        c1 = genfltreg(c1, ESC(MFfloat,1),3,0); // FSTP floatreg
        genfwait(c1);
        c2 = getregs(mDX|mAX);
        c2 = genfltreg(c2, 0x8B, DX, 0);        // MOV EDX,floatreg

        pop87();
        c2 = genfltreg(c2, ESC(MFfloat,1),3,0); // FSTP floatreg
        genfwait(c2);
        c2 = genfltreg(c2, 0x8B, AX, 0);        // MOV EAX,floatreg
    }
    else if (tym == TYcfloat && retregs & (mAX|mDX) && *pretregs & mST01)
    {
        // EDX:EAX => ST01; EAX is pushed first so it ends up in ST1
        c1 = push87();
        c1 = genfltreg(c1, 0x89, AX, 0);        // MOV floatreg, EAX
        genfltreg(c1, 0xD9, 0, 0);              // FLD float ptr floatreg

        c2 = push87();
        c2 = genfltreg(c2, 0x89, DX, 0);        // MOV floatreg, EDX
        genfltreg(c2, 0xD9, 0, 0);              // FLD float ptr floatreg

        if (*pretregs & mPSW)
            c2 = genctst(c2,e,0);               // FTST
    }
    else if ((tym == TYcfloat || tym == TYcdouble) &&
             *pretregs & (mXMM0|mXMM1) && retregs & mST01)
    {
        // ST01 => XMM1:XMM0, each half stored to floatreg as a double;
        // ST0 ends up in XMM1, ST1 in XMM0
        if (*pretregs & mPSW && !(retregs & mPSW))
            c1 = genctst(c1,e,0);               // FTST
        pop87();
        c1 = genfltreg(c1, ESC(MFdouble,1),3,0); // FSTP floatreg
        genfwait(c1);
        c2 = getregs(mXMM0|mXMM1);
        c2 = genfltreg(c2, 0xF20F10, XMM1 - XMM0, 0);    // MOVSD XMM1,floatreg

        pop87();
        c2 = genfltreg(c2, ESC(MFdouble,1),3,0); // FSTP floatreg
        genfwait(c2);
        c2 = genfltreg(c2, 0xF20F10, XMM0 - XMM0, 0);    // MOVSD XMM0,floatreg
    }
    else if ((tym == TYcfloat || tym == TYcdouble) &&
             retregs & (mXMM0|mXMM1) && *pretregs & mST01)
    {
        // XMM1:XMM0 => ST01; XMM0 is pushed first so it ends up in ST1
        c1 = push87();
        c1 = genfltreg(c1, 0xF20F11, XMM0-XMM0, 0);        // MOVSD floatreg, XMM0
        genfltreg(c1, 0xDD, 0, 0);              // FLD double ptr floatreg

        c2 = push87();
        c2 = genfltreg(c2, 0xF20F11, XMM1-XMM0, 0);        // MOVSD floatreg, XMM1
        genfltreg(c2, 0xDD, 0, 0);              // FLD double ptr floatreg

        if (*pretregs & mPSW)
            c2 = genctst(c2,e,0);               // FTST
    }
    else
    {   // No move needed; at most the flags have to be set
        if (*pretregs & mPSW)
        {   if (!(retregs & mPSW))
            {   assert(retregs & mST01);
                // Pop the operand during the test if ST01 itself is not wanted
                c1 = genctst(c1,e,!(*pretregs & mST01));        // FTST
            }
        }
        assert(!(*pretregs & mST01) || (retregs & mST01));
    }
    if (*pretregs & mST01)
    {   // NOTE(review): presumably records which half of e each stack
        // slot holds (offset 0 in ST1, offset sz/2 in ST0) - confirm
        // against note87()
        note87(e,0,1);
        note87(e,sz/2,0);
    }
    return cat(c1,c2);
}

/*****************************************
 * Operators OPc_r and OPc_i
 *
 * Evaluates the complex operand into ST01 and drops one of the two
 * stack entries, leaving a single value in ST0:
 *      OPc_r   pops ST0, so the former ST1 remains
 *      OPc_i   stores ST0 over ST1 and pops, so the former ST0 remains
 *
 * Input:
 *      e               the OPc_r/OPc_i node
 *      pretregs        where the result is wanted
 * Returns:
 *      generated code list
 */

code *cdconvt87(elem *e, regm_t *pretregs)
{
    regm_t retregs = mST01;
    code *c = codelem(e->E1, &retregs, FALSE);

    if (e->Eoper == OPc_r || e->Eoper == OPc_i)
    {
        // FSTP ST(0) is a plain pop; FSTP ST(1) keeps the old ST0
        unsigned sti = (e->Eoper == OPc_r) ? 0 : 1;
        c = genf2(c,0xDD,0xD8 + sti);   // FSTP ST(sti)
        pop87();
    }
    else
        assert(0);

    retregs = mST0;
    return cat(c, fixresult87(e, retregs, pretregs));
}

/**************************************
 * Load complex operand into ST01 or flags or both.
 *
 * Supports variables and indirections (both halves are loaded from
 * memory, the half at offset 0 first), constants whose halves
 * loadconst() can produce with a single FLDx instruction, and the
 * float/double/long double conversion operators, which simply load
 * their operand.
 *
 * Input:
 *      e               complex expression to load
 *      pretregs        where the result is wanted: mST01 and/or mPSW
 * Returns:
 *      generated code list
 */

code *cload87(elem *e, regm_t *pretregs)
#if __DMC__
__in
{
    assert(I32 && config.inline8087);
    elem_debug(e);
    assert(*pretregs & (mST01 | mPSW));
    assert(!(*pretregs & ~(mST01 | mPSW)));
}
__out (result)
{
}
__body
#endif
{
    tym_t ty = tybasic(e->Ety);
    code *c = NULL;
    code *cpush = NULL;         // code from reserving the two 8087 stack slots
    code cs;
    unsigned mf;                // MF field for the FLD; not set for TYcldouble
    unsigned sz;                // size of one half of the complex value
    unsigned char ldop;
    regm_t retregs;
    int i;

    //printf("cload87(e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
    sz = tysize[ty] / 2;
    memset(&cs, 0, sizeof(cs));
    if (ADDFWAIT())
        cs.Iflags = CFwait;
    switch (ty)
    {
        case TYcfloat:      mf = MFfloat;           break;
        case TYcdouble:     mf = MFdouble;          break;
        case TYcldouble:    break;      // uses the 0xDB /5 form below, no MF needed
        default:            assert(0);
    }
    switch (e->Eoper)
    {
        case OPvar:
            notreg(e);                  // never enregister this variable
            // fall through
        case OPind:
            cpush = cat(push87(), push87());
            switch (ty)
            {
                case TYcfloat:
                case TYcdouble:
                    c = loadea(e,&cs,ESC(mf,1),0,0,0,0);        // FLD var
                    cs.IEVoffset1 += sz;        // second half follows the first
                    c = gen(c, &cs);
                    break;

                case TYcldouble:
                    c = loadea(e,&cs,0xDB,5,0,0,0);             // FLD var
                    cs.IEVoffset1 += sz;        // second half follows the first
                    c = gen(c, &cs);
                    break;

                default:
                    assert(0);
            }
            retregs = mST01;
            break;

        case OPd_ld:
        case OPld_d:
        case OPf_d:
        case OPd_f:
            // Load the operand directly and let it stand in for the result
            c = cload87(e->E1, pretregs);
            freenode(e->E1);
            return c;

        case OPconst:
            cpush = cat(push87(), push87());
            // One FLDx per half; a half loadconst() has no opcode for
            // is not supported here
            for (i = 0; i < 2; i++)
            {
                ldop = loadconst(e, i);
                if (ldop)
                {
                    c = genf2(c,0xD9,ldop);             // FLDx
                }
                else
                {
                    assert(0);
                }
            }
            retregs = mST01;
            break;

        default:
#ifdef DEBUG
            elem_print(e);
#endif
            assert(0);
    }
    return cat4(cpush,c,fixresult_complex87(e, retregs, pretregs), NULL);
}

#endif // !SPP