// ldc/backend/cod3.c

// Copyright (C) 1984-1998 by Symantec
// Copyright (C) 2000-2011 by Digital Mars
// All Rights Reserved
// http://www.digitalmars.com
// Written by Walter Bright
/*
 * This source file is made available for personal use
 * only. The license is in /dmd/src/dmd/backendlicense.txt
 * or /dm/src/dmd/backendlicense.txt
 * For any other uses, please contact Digital Mars.
 */

#if !SPP

#include        <stdio.h>
#include        <string.h>
#include        <stdlib.h>
#include        <time.h>
#include        "cc.h"
#include        "el.h"
#include        "code.h"
#include        "oper.h"
#include        "global.h"
#include        "type.h"
#include        "tinfo.h"
#if SCPP
#include        "exh.h"
#endif

#if HYDRATE
#include        "parser.h"
#endif

static char __file__[] = __FILE__;      /* for tassert.h                */
#include        "tassert.h"

// Defined in another translation unit.
extern targ_size_t retsize;

// Forward declarations of file-local helpers; their definitions are not
// part of the section of the file shown here.
STATIC void pinholeopt_unittest();
STATIC void do8bit (enum FL,union evc *);
STATIC void do16bit (enum FL,union evc *,int);
STATIC void do32bit (enum FL,union evc *,int,targ_size_t = 0);
STATIC void do64bit (enum FL,union evc *,int);

// Per-function frame layout state. All are file-local except BPoff,
// which has external linkage.
static int hasframe;            /* !=0 if this function has a stack frame */
static targ_size_t Foff;        // BP offset of floating register
static targ_size_t CSoff;       // offset of common sub expressions
static targ_size_t NDPoff;      // offset of saved 8087 registers
int BPoff;                      // offset from BP
static int EBPtoESP;            // add to EBP offset to get ESP offset
static int AAoff;               // offset of alloca temporary

// Segment / offset pair selected by object format: ELF and Mach-O builds use
// CDATA/CDoffset, all other builds use DATA/Doffset.
// NOTE(review): the JMP prefix suggests these locate jump tables - confirm
// at the use sites, which are outside this section.
#if ELFOBJ || MACHOBJ
#define JMPSEG  CDATA
#define JMPOFF  CDoffset
#else
#define JMPSEG  DATA
#define JMPOFF  Doffset
#endif

/*************
 * Size in bytes of each instruction.
 * 0 means illegal instruction.
 * bit  M:      if there is a modregrm field (EV1 is reserved for modregrm)
 * bit  T:      if there is a second operand (EV2)
 * bit  E:      if second operand is only 8 bits
 * bit  A:      a short version exists for the AX reg
 * bit  R:      a short version exists for regs
 * bits 2..0:   size of instruction (excluding optional bytes)
 * W:           defined as 0, so OR-ing it into an entry sets no bit; it is
 *              only used as a marker on the inssize2[] entries 0x80..0x8F,
 *              which cod3_set32()/cod3_set64() overwrite
 *
 * These flags are OR-ed into the entries of inssize[] and inssize2[];
 * inssize32[] holds plain sizes without flags.
 */

#define M 0x80          // modregrm byte present
#define T 0x40          // second operand (EV2) present
#define E 0x20          // second operand is 8 bits
#define A 0x10          // short form exists for the AX register
#define R 0x08          // short form exists for registers
#define W 0             // no bit set; see note above

// Size-and-flags table for one-byte opcodes, indexed by the opcode byte
// (cod3_EA() looks entries up with the low byte of Iop).
// Each entry is the instruction size OR-ed with the M/T/E/A/R flag bits.
// The table is not const: entries 0xA0..0xA3 start out as T|3 and are
// overwritten with T|5 by cod3_set32() and cod3_set64().
static unsigned char inssize[256] =
{       M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 00 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 08 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 10 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 18 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 20 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 28 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 30 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 38 */
        1,1,1,1,                1,1,1,1,                /* 40 */
        1,1,1,1,                1,1,1,1,                /* 48 */
        1,1,1,1,                1,1,1,1,                /* 50 */
        1,1,1,1,                1,1,1,1,                /* 58 */
        1,1,M|2,M|2,            1,1,1,1,                /* 60 */
        T|3,M|T|4,T|E|2,M|T|E|3, 1,1,1,1,               /* 68 */
        T|E|2,T|E|2,T|E|2,T|E|2, T|E|2,T|E|2,T|E|2,T|E|2,       /* 70 */
        T|E|2,T|E|2,T|E|2,T|E|2, T|E|2,T|E|2,T|E|2,T|E|2,       /* 78 */
        M|T|E|A|3,M|T|A|4,M|T|E|3,M|T|E|3,      M|2,M|2,M|2,M|A|R|2, /* 80 */
        M|A|2,M|A|2,M|A|2,M|A|2,        M|2,M|2,M|2,M|R|2,      /* 88 */
        1,1,1,1,                1,1,1,1,                /* 90 */
        1,1,T|5,1,              1,1,1,1,                /* 98 */
#if 0 /* cod3_set32() patches this */
        T|5,T|5,T|5,T|5,        1,1,1,1,                /* A0 */
#else
        // initial sizes; replaced with T|5 at run time by cod3_set32()/cod3_set64()
        T|3,T|3,T|3,T|3,        1,1,1,1,                /* A0 */
#endif
        T|E|2,T|3,1,1,          1,1,1,1,                /* A8 */
        T|E|2,T|E|2,T|E|2,T|E|2, T|E|2,T|E|2,T|E|2,T|E|2,       /* B0 */
        T|3,T|3,T|3,T|3,        T|3,T|3,T|3,T|3,                /* B8 */
        M|T|E|3,M|T|E|3,T|3,1,  M|2,M|2,M|T|E|R|3,M|T|R|4,      /* C0 */
        T|E|4,1,T|3,1,          1,T|E|2,1,1,            /* C8 */
        M|2,M|2,M|2,M|2,        T|E|2,T|E|2,0,1,        /* D0 */
        /* For the floating instructions, allow room for the FWAIT      */
        M|2,M|2,M|2,M|2,        M|2,M|2,M|2,M|2,        /* D8 */
        T|E|2,T|E|2,T|E|2,T|E|2, T|E|2,T|E|2,T|E|2,T|E|2,       /* E0 */
        T|3,T|3,T|5,T|E|2,              1,1,1,1,                /* E8 */
        1,0,1,1,                1,1,M|A|2,M|A|2,                /* F0 */
        1,1,1,1,                1,1,M|2,M|R|2                   /* F8 */
};

// Plain instruction sizes for one-byte opcodes, indexed by the opcode byte.
// Unlike inssize[], the entries carry no M/T/E/A/R flag bits, and entries
// that are 3 (or 4) in inssize[] because of a 2 byte operand are 5 (or 6)
// here (compare opcode 0x05: T|3 there, 5 here).
// NOTE(review): presumably the sizes used when generating 32 bit code; the
// users of this table are outside this section - confirm.
static const unsigned char inssize32[256] =
{       2,2,2,2,        2,5,1,1,                /* 00 */
        2,2,2,2,        2,5,1,1,                /* 08 */
        2,2,2,2,        2,5,1,1,                /* 10 */
        2,2,2,2,        2,5,1,1,                /* 18 */
        2,2,2,2,        2,5,1,1,                /* 20 */
        2,2,2,2,        2,5,1,1,                /* 28 */
        2,2,2,2,        2,5,1,1,                /* 30 */
        2,2,2,2,        2,5,1,1,                /* 38 */
        1,1,1,1,        1,1,1,1,                /* 40 */
        1,1,1,1,        1,1,1,1,                /* 48 */
        1,1,1,1,        1,1,1,1,                /* 50 */
        1,1,1,1,        1,1,1,1,                /* 58 */
        1,1,2,2,        1,1,1,1,                /* 60 */
        5,6,2,3,        1,1,1,1,                /* 68 */
        2,2,2,2,        2,2,2,2,                /* 70 */
        2,2,2,2,        2,2,2,2,                /* 78 */
        3,6,3,3,        2,2,2,2,                /* 80 */
        2,2,2,2,        2,2,2,2,                /* 88 */
        1,1,1,1,        1,1,1,1,                /* 90 */
        1,1,7,1,        1,1,1,1,                /* 98 */
        5,5,5,5,        1,1,1,1,                /* A0 */
        2,5,1,1,        1,1,1,1,                /* A8 */
        2,2,2,2,        2,2,2,2,                /* B0 */
        5,5,5,5,        5,5,5,5,                /* B8 */
        3,3,3,1,        2,2,3,6,                /* C0 */
        4,1,3,1,        1,2,1,1,                /* C8 */
        2,2,2,2,        2,2,0,1,                /* D0 */
        /* For the floating instructions, don't need room for the FWAIT */
        2,2,2,2,        2,2,2,2,                /* D8 */

        2,2,2,2,        2,2,2,2,                /* E0 */
        5,5,7,2,        1,1,1,1,                /* E8 */
        1,0,1,1,        1,1,2,2,                /* F0 */
        1,1,1,1,        1,1,2,2                 /* F8 */
};

/* For 2 byte opcodes starting with 0x0F        */
// Indexed by the byte that follows 0x0F. Entries use the same size-plus-flags
// encoding as inssize[]. cod3_EA() and vex_inssize() also read the 0x38 and
// 0x3A entries for the three-byte 0F 38 xx / 0F 3A xx opcodes.
// Not const: entries 0x80..0x8F start as W|T|4 and are overwritten with
// W|T|6 by cod3_set32() and cod3_set64().
static unsigned char inssize2[256] =
{       M|3,M|3,M|3,M|3,        2,2,2,2,                // 00
        2,2,M|3,2,              2,2,2,M|T|E|4,          // 08
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 10
        M|3,2,2,2,              2,2,2,2,                // 18
        M|3,M|3,M|3,M|3,        M|3,2,M|3,2,            // 20
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 28
        2,2,2,2,                2,2,2,2,                // 30
        M|4,2,M|T|E|5,2,        2,2,2,2,                // 38
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 40
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 48
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 50
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 58
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 60
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 68
        M|T|E|4,M|T|E|4,M|T|E|4,M|T|E|4, M|3,M|3,M|3,2, // 70
        2,2,2,2,                M|3,M|3,M|3,M|3,        // 78
        W|T|4,W|T|4,W|T|4,W|T|4, W|T|4,W|T|4,W|T|4,W|T|4, // 80
        W|T|4,W|T|4,W|T|4,W|T|4, W|T|4,W|T|4,W|T|4,W|T|4, // 88
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // 90
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // 98
        2,2,2,M|3,      M|T|E|4,M|3,2,2,        // A0
        2,2,2,M|3,      M|T|E|4,M|3,M|3,M|3,    // A8
        M|E|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,     // B0
        M|3,2,M|T|E|4,M|3, M|3,M|3,M|3,M|3,     // B8
        M|3,M|3,M|T|E|4,M|3, M|T|E|4,M|T|E|4,M|T|E|4,M|3,       // C0
        2,2,2,2,        2,2,2,2,                // C8
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // D0
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // D8
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // E0
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // E8
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // F0
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,2          // F8
};

/*************************************************
 * Allocate a slot in the register save area and generate the store
 * of a register into it.
 * Input:
 *      c       code generated so far; the store is appended to it
 *      reg     register to save; values >= XMM0 are XMM registers
 *      pidx    receives the offset of the slot that was allocated
 * Returns:
 *      c with the store instruction appended
 * Side effects:
 *      advances idx, raises top to the new high water mark, updates
 *      alignment and sets reflocal
 */

code *REGSAVE::save(code *c, int reg, unsigned *pidx)
{
    unsigned slot;

    if (reg < XMM0)
    {
        // General purpose register: one REGSIZE slot, no re-alignment of idx
        if (!alignment)
            alignment = REGSIZE;
        slot = idx;
        idx += REGSIZE;

        // MOV slot[RBP],reg
        c = genc1(c,0x89,modregxrm(2, reg, BPRM),FLregsave,(targ_uns) slot);
        if (I64)
            code_orrex(c, REX_W);
    }
    else
    {
        // XMM register: round idx up to a 16 byte boundary and take 16 bytes
        alignment = 16;
        idx = (idx + 15) & ~15;
        slot = idx;
        idx += 16;

        // MOVSD slot[RBP],xmm   (opcode F2 0F 11)
        c = genc1(c,0xF20F11,modregxrm(2, reg - XMM0, BPRM),FLregsave,(targ_uns) slot);
    }

    reflocal = TRUE;
    if (top < idx)
        top = idx;              // remember the largest extent reached
    *pidx = slot;
    return c;
}

/*************************************************
 * Generate the load of a register from a slot in the register save area.
 * Input:
 *      c       code generated so far; the load is appended to it
 *      reg     register to reload; values >= XMM0 are XMM registers
 *      idx     slot offset, as handed out through pidx by REGSAVE::save()
 *              (this parameter hides the member of the same name)
 * Returns:
 *      c with the load instruction appended
 */
code *REGSAVE::restore(code *c, int reg, unsigned idx)
{
    if (reg < XMM0)
    {
        // MOV reg,idx[RBP]
        c = genc1(c,0x8B,modregxrm(2, reg, BPRM),FLregsave,(targ_uns) idx);
        if (I64)
            code_orrex(c, REX_W);
        return c;
    }

    // An XMM save must have set the 16 byte alignment beforehand
    assert(alignment == 16);
    // MOVSD xmm,idx[RBP]   (opcode F2 0F 10)
    return genc1(c,0xF20F10,modregxrm(2, reg - XMM0, BPRM),FLregsave,(targ_uns) idx);
}

/************************************
 * Size for vex encoded instruction.
 * Input:
 *      c       instruction; must have CFvex set in Iflags
 * Returns:
 *      the inssize2[] entry for the opcode (flag bits included) with the
 *      extra prefix bytes added:
 *        2 byte VEX form:              inssize2[op] + 1
 *        3 byte VEX form, mmmm 0 or 1: inssize2[op] + 2
 *        3 byte VEX form, mmmm 2:      inssize2[0x38] + 1
 *        3 byte VEX form, mmmm 3:      inssize2[0x3A] + 1
 */

unsigned char vex_inssize(code *c)
{
    assert(c->Iflags & CFvex);

    if (!(c->Iflags & CFvex3))
        return inssize2[c->Ivex.op] + 1;        // short (2 byte) VEX prefix

    unsigned char size;
    switch (c->Ivex.mmmm)
    {
        case 0:                                 // no prefix
        case 1:                                 // 0F
            size = inssize2[c->Ivex.op] + 2;
            break;

        case 2:                                 // 0F 38
            size = inssize2[0x38] + 1;
            break;

        case 3:                                 // 0F 3A
            size = inssize2[0x3A] + 1;
            break;

        default:                                // no other map is handled
            assert(0);
    }
    return size;
}

/************************************
 * Determine if there is a modregrm byte for code.
 * Input:
 *      c       instruction to examine
 * Returns:
 *      M (non-zero) if the size table entry for c's opcode has the M flag,
 *      0 otherwise; always 0 when the low opcode byte is ESCAPE
 */

int cod3_EA(code *c)
{
    const unsigned lowbyte = c->Iop & 0xFF;

    if (lowbyte == ESCAPE)
        return 0;

    unsigned entry;
    if ((c->Iop & 0xFFFD00) == 0x0F3800)
    {
        // 0F 38 xx and 0F 3A xx (the mask ignores the bit that tells them
        // apart): look up by the middle byte, i.e. 0x38 or 0x3A
        entry = inssize2[(c->Iop >> 8) & 0xFF];
    }
    else if ((c->Iop & 0xFF00) == 0x0F00)
    {
        // 0F xx: look up by the second opcode byte
        entry = inssize2[lowbyte];
    }
    else
    {
        // one-byte opcode
        entry = inssize[lowbyte];
    }
    return entry & M;
}

/********************************
 * Fix global variables for 386.
 * Patches the instruction size tables and selects the 32 bit register
 * sets:
 *      inssize[0xA0..0xA3]     become T|5 (T|3 in the static table)
 *      inssize2[0x80..0x8F]    become W|T|6 (W|T|4 in the static table)
 *      BPRM                    5, the [EBP] addressing mode
 *      fregsaved               BP, BX, SI, DI, plus ES if CFG3eseqds is set
 *      FLOATREGS, FLOATREGS2, DOUBLEREGS  take their _32 variants
 */

void cod3_set32()
{
    // One-byte opcodes A0..A3
    for (unsigned op = 0xA0; op <= 0xA3; op++)
        inssize[op] = T|5;

    // Two-byte opcodes 0F 80 .. 0F 8F
    for (unsigned op = 0x80; op <= 0x8F; op++)
        inssize2[op] = W|T|6;

    BPRM = 5;                       /* [EBP] addressing mode        */

    FLOATREGS = FLOATREGS_32;
    FLOATREGS2 = FLOATREGS2_32;
    DOUBLEREGS = DOUBLEREGS_32;

    // Registers saved across function calls
    fregsaved = mBP | mBX | mSI | mDI;
    if (config.flags3 & CFG3eseqds)
        fregsaved |= mES;
}

/********************************
 * Fix global variables for I64.
 * Patches the instruction size tables and selects the 64 bit register
 * sets:
 *      inssize[0xA0..0xA3]     become T|5 (MOV AL/RAX,mem and MOV mem,AL/RAX)
 *      inssize2[0x80..0x8F]    become W|T|6 (W|T|4 in the static table)
 *      BPRM                    5, the [RBP] addressing mode
 *      fregsaved               BP, BX, R12..R15 and ES
 *      FLOATREGS, FLOATREGS2, DOUBLEREGS  take their _64 variants
 *      STACKALIGN              16
 *      ALLREGS, BYTEREGS       widened on Linux/OSX/FreeBSD/OpenBSD/Solaris
 */

void cod3_set64()
{
    // One-byte opcodes A0..A3: MOV AL,mem / MOV RAX,mem / MOV mem,AL / MOV mem,RAX
    for (unsigned op = 0xA0; op <= 0xA3; op++)
        inssize[op] = T|5;

    // Two-byte opcodes 0F 80 .. 0F 8F
    for (unsigned op = 0x80; op <= 0x8F; op++)
        inssize2[op] = W|T|6;

    BPRM = 5;                           // [RBP] addressing mode
    STACKALIGN = 16;

    FLOATREGS = FLOATREGS_64;
    FLOATREGS2 = FLOATREGS2_64;
    DOUBLEREGS = DOUBLEREGS_64;

    // Registers saved across function calls
    fregsaved = mBP | mBX | mR12 | mR13 | mR14 | mR15 | mES;

#if TARGET_LINUX || TARGET_OSX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_SOLARIS
    // On these targets every general register listed here is also usable
    // as a byte register
    ALLREGS = mAX|mBX|mCX|mDX|mSI|mDI|  mR8|mR9|mR10|mR11|mR12|mR13|mR14|mR15;
    BYTEREGS = ALLREGS;
#endif
}

/*********************************
 * Align start of function by emitting NOP (0x90) bytes at Coffset in cseg.
 *
 * OMFOBJ builds: pads only when optimizing for speed (CFG4speed) and the
 * target CPU is the 80486 or PentiumPro and later. In that case, if fewer
 * than 8 bytes are needed to reach the next 16 byte boundary, they are
 * emitted and Coffset is advanced by what obj_bytes() returns.
 *
 * All other builds: always pads up to the next 4 byte boundary; Coffset is
 * not assigned here.
 */

void cod3_align()
{
    static unsigned char nops[7] = { 0x90,0x90,0x90,0x90,0x90,0x90,0x90 };

#if OMFOBJ
    if (!(config.flags4 & CFG4speed))   // not optimizing for speed
        return;

    // Pick alignment based on CPU target
    if (config.target_cpu != TARGET_80486 &&
        config.target_cpu < TARGET_PentiumPro)
        return;

    // 486 does reads on 16 byte boundaries, so if we are near
    // such a boundary, align us to it
    unsigned pad = -Coffset & 15;
    if (pad < 8)
        Coffset += obj_bytes(cseg,Coffset,pad,nops);    // XCHG AX,AX
#else
    unsigned pad = -Coffset & 3;
    //dbg_printf("cod3_align Coffset %x nbytes %d\n",Coffset,pad);
    obj_bytes(cseg,Coffset,pad,nops);
#endif
}

/*****************************
 * Given a type, return a mask of
 * registers to hold that type.
 * Input:
 *      tym     type to find registers for; only tybasic(tym) is examined
 *      tyf     function type; only consulted for TYcfloat, and only on the
 *              Linux/OSX/FreeBSD/OpenBSD/Solaris targets
 * Returns:
 *      register mask; 0 for TYvoid and TYstruct.
 *      An unrecognized type asserts.
 * Note:
 *      Several cases deliberately fall through into the following case
 *      when none of their conditions match; these are marked below.
 */

regm_t regmask(tym_t tym, tym_t tyf)
{
    switch (tybasic(tym))
    {
        case TYvoid:
        case TYstruct:
            return 0;
        case TYbool:
        case TYwchar_t:
        case TYchar16:
        case TYchar:
        case TYschar:
        case TYuchar:
        case TYshort:
        case TYushort:
        case TYint:
        case TYuint:
#if JHANDLE
        case TYjhandle:
#endif
        case TYnullptr:
        case TYnptr:
#if TARGET_SEGMENTED
        case TYsptr:
        case TYcptr:
#endif
            return mAX;

        case TYfloat:
        case TYifloat:
            if (I64)
                return mXMM0;
            if (config.exe & EX_flat)
                return mST0;
            // fall through: otherwise treated like TYlong
        case TYlong:
        case TYulong:
        case TYdchar:
            if (!I16)
                return mAX;
            // fall through: for I16 the value needs the DX:AX pair
#if TARGET_SEGMENTED
        case TYfptr:
        case TYhptr:
#endif
            return mDX | mAX;

        case TYcent:
        case TYucent:
            assert(I64);
            return mDX | mAX;

#if TARGET_SEGMENTED
        case TYvptr:
            return mDX | mBX;
#endif

        case TYdouble:
        case TYdouble_alias:
        case TYidouble:
            if (I64)
                return mXMM0;
            if (config.exe & EX_flat)
                return mST0;
            return DOUBLEREGS;

        case TYllong:
        case TYullong:
            // I64: AX alone; I32: DX:AX pair; otherwise DOUBLEREGS
            return I64 ? mAX : (I32 ? mDX | mAX : DOUBLEREGS);

        case TYldouble:
        case TYildouble:
            return mST0;

        case TYcfloat:
#if TARGET_LINUX || TARGET_OSX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_SOLARIS
            // The only use of tyf: I32 with a TYnfunc function type
            if (I32 && tybasic(tyf) == TYnfunc)
                return mDX | mAX;
#endif
            // fall through: otherwise treated like TYcdouble
        case TYcdouble:
            if (I64)
                return mXMM0 | mXMM1;
            // fall through: otherwise treated like TYcldouble
        case TYcldouble:
            return mST01;

        // SIMD vector types
        case TYfloat4:
        case TYdouble2:
        case TYschar16:
        case TYuchar16:
        case TYshort8:
        case TYushort8:
        case TYlong4:
        case TYulong4:
        case TYllong2:
        case TYullong2:
            // Without config.fpxmmregs this prints a message and
            // terminates the process with exit status 1
            if (!config.fpxmmregs)
            {   printf("SIMD operations not supported on this platform\n");
                exit(1);
            }
            return mXMM0;

        default:
#if DEBUG
            WRTYxx(tym);
#endif
            assert(0);
            return 0;
    }
}

/*******************************
 * Generate block exit code
 * Input:
 *      bl              block to generate the exit code for; bl->BC selects
 *                      the kind of exit, and the generated code is normally
 *                      stored in bl->Bcode
 *      c               code accumulated so far for the block (by reference);
 *                      the exit code is appended to it
 *      anyspill        if non-zero, the number of leading globsym.tab[]
 *                      entries checked for spilled symbols on the case_goto
 *                      path
 *      sflsave         saved Sfl values, indexed like globsym.tab[]; used to
 *                      undo block register assignments of spilled symbols
 *      retsym          out: for BCretexp with CFG4optimized, receives the
 *                      symbol of a returned OPvar that lives in a register
 *                      other than mAX
 *      mfuncregsave    value copied back into mfuncreg for BCexit blocks
 *                      when CFG4optimized is set
 *
 * Labels shared between cases:
 *      L2              common tail: jump to nextb unless it is the block
 *                      that follows, then append c to bl->Bcode
 *      L3              clear bl->Bcode, take the first successor as nextb,
 *                      continue at L2
 *      case_goto       evaluate e, add spill epilog code, continue at L3
 */
void outblkexitcode(block *bl, code*& c, int& anyspill, const char* sflsave, symbol** retsym, const regm_t mfuncregsave)
{
    elem *e = bl->Belem;
    block *nextb;
    block *bs1,*bs2;
    regm_t retregs = 0;
    bool jcond;

    switch (bl->BC)                     /* block exit condition         */
    {
        case BCiftrue:
            jcond = TRUE;
            bs1 = list_block(bl->Bsucc);
            bs2 = list_block(list_next(bl->Bsucc));
            // If the first successor is the block that follows, invert the
            // condition so that it becomes the fall-through target
            if (bs1 == bl->Bnext)
            {   // Swap bs1 and bs2
                block *btmp;

                jcond ^= 1;
                btmp = bs1;
                bs1 = bs2;
                bs2 = btmp;
            }
            c = cat(c,logexp(e,jcond,FLblock,(code *) bs1));
            nextb = bs2;
            bl->Bcode = NULL;
        L2:
            // Common tail: a JMP is only needed when nextb is not the
            // block that immediately follows bl
            if (nextb != bl->Bnext)
            {   if (configv.addlinenumbers && bl->Bsrcpos.Slinnum &&
                    !(funcsym_p->ty() & mTYnaked))
                    cgen_linnum(&c,bl->Bsrcpos);
                assert(!(bl->Bflags & BFLepilog));
                c = cat(c,genjmp(CNIL,JMP,FLblock,nextb));
            }
            bl->Bcode = cat(bl->Bcode,c);
            break;
        case BCjmptab:
        case BCifthen:
        case BCswitch:
            assert(!(bl->Bflags & BFLepilog));
            doswitch(bl);               /* hide messy details           */
            // c goes in front of the bl->Bcode left behind by doswitch()
            bl->Bcode = cat(c,bl->Bcode);
            break;
#if MARS
        case BCjcatch:
            // Mark all registers as destroyed. This will prevent
            // register assignments to variables used in catch blocks.
            c = cat(c,getregs((I32 | I64) ? allregs : (ALLREGS | mES)));
#if 0 && TARGET_LINUX
            if (config.flags3 & CFG3pic && !(allregs & mBX))
            {
                c = cat(c, cod3_load_got());
            }
#endif
            goto case_goto;
#endif
#if SCPP
        case BCcatch:
            // Mark all registers as destroyed. This will prevent
            // register assignments to variables used in catch blocks.
            c = cat(c,getregs(allregs | mES));
#if 0 && TARGET_LINUX
            if (config.flags3 & CFG3pic && !(allregs & mBX))
            {
                c = cat(c, cod3_load_got());
            }
#endif
            goto case_goto;

        case BCtry:
            usednteh |= EHtry;
            if (config.flags2 & CFG2seh)
                usednteh |= NTEHtry;
            goto case_goto;
#endif
        case BCgoto:
            nextb = list_block(bl->Bsucc);
            // Special handling when the jump leaves the current try scope
            // (bl and nextb have different Btry) and the target is not a
            // BC_finally block
            if ((funcsym_p->Sfunc->Fflags3 & Fnteh ||
                 (MARS /*&& config.flags2 & CFG2seh*/)) &&
                bl->Btry != nextb->Btry &&
                nextb->BC != BC_finally)
            {   int toindex;
                int fromindex;

                bl->Bcode = NULL;
                c = gencodelem(c,e,&retregs,TRUE);
                // scope index of the destination; -1 when it is in no try block
                toindex = nextb->Btry ? nextb->Btry->Bscope_index : -1;
                assert(bl->Btry);
                fromindex = bl->Btry->Bscope_index;
#if MARS
                if (toindex + 1 == fromindex)
                {   // Simply call __finally
                    // When the enclosing try's second successor is a catch
                    // (BCjcatch), no unwind code is generated: just jump
                    if (bl->Btry &&
                        list_block(list_next(bl->Btry->Bsucc))->BC == BCjcatch)
                    {
                        goto L2;
                    }
                }
#endif
                if (config.flags2 & CFG2seh)
                    c = cat(c,nteh_unwind(0,toindex));
#if MARS && (TARGET_LINUX || TARGET_OSX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_SOLARIS)
                else if (toindex + 1 <= fromindex)
                {
                    //c = cat(c, linux_unwind(0, toindex));
                    block *bt;

                    //printf("B%d: fromindex = %d, toindex = %d\n", bl->Bdfoidx, fromindex, toindex);
                    // Walk outwards through the enclosing try blocks until
                    // the destination scope is reached, calling each
                    // finally handler on the way
                    bt = bl;
                    while ((bt = bt->Btry) != NULL && bt->Bscope_index != toindex)
                    {   block *bf;

                        //printf("\tbt->Bscope_index = %d, bt->Blast_index = %d\n", bt->Bscope_index, bt->Blast_index);
                        bf = list_block(list_next(bt->Bsucc));
                        // Only look at try-finally blocks
                        if (bf->BC == BCjcatch)
                            continue;

                        // The jump target is this handler itself
                        if (bf == nextb)
                            continue;
                        //printf("\tbf = B%d, nextb = B%d\n", bf->Bdfoidx, nextb->Bdfoidx);
                        // The target is an empty BCgoto block that only
                        // forwards to this handler
                        if (nextb->BC == BCgoto &&
                            !nextb->Belem &&
                            bf == list_block(nextb->Bsucc))
                            continue;

                        // call __finally
                        code *cs;
                        code *cr;
                        int nalign = 0;

                        // cs/cr: code to save and restore retregs around the call
                        gensaverestore(retregs,&cs,&cr);
                        if (STACKALIGN == 16)
                        {   // one REGSIZE slot per register in retregs plus one more
                            // NOTE(review): the extra slot is presumably the
                            // CALL's return address - confirm
                            int npush = (numbitsset(retregs) + 1) * REGSIZE;
                            if (npush & (STACKALIGN - 1))
                            {   nalign = STACKALIGN - (npush & (STACKALIGN - 1));
                                cs = genc2(cs,0x81,modregrm(3,5,SP),nalign); // SUB ESP,nalign
                                if (I64)
                                    code_orrex(cs, REX_W);
                            }
                        }
                        cs = genc(cs,0xE8,0,0,0,FLblock,(targ_size_t)list_block(bf->Bsucc));
                        if (nalign)
                        {   cs = genc2(cs,0x81,modregrm(3,0,SP),nalign); // ADD ESP,nalign
                            if (I64)
                                code_orrex(cs, REX_W);
                        }
                        c = cat3(c,cs,cr);
                    }
                }
#endif
                goto L2;
            }
            // Not leaving a try scope: fall through to the plain goto handling
        case_goto:
            c = gencodelem(c,e,&retregs,TRUE);
            if (anyspill)
            {   // Add in the epilog code
                code *cstore = NULL;
                code *cload = NULL;

                for (int i = 0; i < anyspill; i++)
                {   symbol *s = globsym.tab[i];

                    if (s->Sflags & SFLspill &&
                        vec_testbit(dfoidx,s->Srange))
                    {
                        s->Sfl = sflsave[i];    // undo block register assignments
                        cgreg_spillreg_epilog(bl,s,&cstore,&cload);
                    }
                }
                c = cat3(c,cstore,cload);
            }

        L3:
            bl->Bcode = NULL;
            nextb = list_block(bl->Bsucc);
            goto L2;

        case BC_try:
            if (config.flags2 & CFG2seh)
            {   usednteh |= NTEH_try;
                nteh_usevars();
            }
            else
                usednteh |= EHtry;
            goto case_goto;

        case BC_finally:
            // Mark all registers as destroyed. This will prevent
            // register assignments to variables used in finally blocks.
            // NOTE(review): the getregs() call is an argument of assert(); it
            // only runs if this file's assert (tassert.h) always evaluates
            // its operand - confirm.
            assert(!getregs(allregs));
            assert(!e);
            assert(!bl->Bcode);
#if TARGET_LINUX || TARGET_OSX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_SOLARIS
            if (config.flags3 & CFG3pic)
            {
                // PIC: CALL the first successor, then JMP to the second
                int nalign = 0;
                if (STACKALIGN == 16)
                {   nalign = STACKALIGN - REGSIZE;
                    c = genc2(c,0x81,modregrm(3,5,SP),nalign); // SUB ESP,nalign
                    if (I64)
                        code_orrex(c, REX_W);
                }
                // CALL bl->Bsucc
                c = genc(c,0xE8,0,0,0,FLblock,(targ_size_t)list_block(bl->Bsucc));
                if (nalign)
                {   c = genc2(c,0x81,modregrm(3,0,SP),nalign); // ADD ESP,nalign
                    if (I64)
                        code_orrex(c, REX_W);
                }
                // JMP list_next(bl->Bsucc)
                nextb = list_block(list_next(bl->Bsucc));
                goto L2;
            }
            else
#endif
            {
                // Generate a PUSH of the address of the successor to the
                // corresponding BC_ret
                //assert(list_block(list_next(bl->Bsucc))->BC == BC_ret);
                // PUSH &succ
                c = genc(c,0x68,0,0,0,FLblock,(targ_size_t)list_block(list_next(bl->Bsucc)));
                nextb = list_block(bl->Bsucc);
                goto L2;
            }

        case BC_ret:
            c = gencodelem(c,e,&retregs,TRUE);
            bl->Bcode = gen1(c,0xC3);   // RET
            break;

#if NTEXCEPTIONS
        case BC_except:
            assert(!e);
            usednteh |= NTEH_except;
            c = cat(c,nteh_setsp(0x8B));
            // NOTE(review): the value returned by getregs() is discarded
            // here and in BC_filter, unlike in the catch cases above
            getregs(allregs);
            goto L3;

        case BC_filter:
            c = cat(c,nteh_filter(bl));
            // Mark all registers as destroyed. This will prevent
            // register assignments to variables used in filter blocks.
            getregs(allregs);
            retregs = regmask(e->Ety, TYnfunc);
            c = gencodelem(c,e,&retregs,TRUE);
            bl->Bcode = gen1(c,0xC3);   // RET
            break;
#endif

        case BCretexp:
            retregs = regmask(e->Ety, funcsym_p->ty());

            // For the final load into the return regs, don't set regcon.used,
            // so that the optimizer can potentially use retregs for register
            // variable assignments.

            if (config.flags4 & CFG4optimized)
            {   regm_t usedsave;

                c = cat(c,docommas(&e));
                usedsave = regcon.used;
                if (EOP(e))
                    c = gencodelem(c,e,&retregs,TRUE);
                else
                {
                    if (e->Eoper == OPconst)
                        regcon.mvar = 0;
                    c = gencodelem(c,e,&retregs,TRUE);
                    regcon.used = usedsave;
                    if (e->Eoper == OPvar)
                    {   symbol *s = e->EV.sp.Vsym;

                        // report a returned register variable that is not in AX
                        if (s->Sfl == FLreg && s->Sregm != mAX)
                            *retsym = s;
                    }
                }
            }
            else
            {
        // BCret and BCexit enter here, inside the else body, and share the
        // rest of the BCretexp handling with retregs still 0
        case BCret:
        case BCexit:
                c = gencodelem(c,e,&retregs,TRUE);
            }
            bl->Bcode = c;
            // A value returned on the x87 stack is no longer tracked
            if (retregs == mST0)
            {   assert(stackused == 1);
                pop87();                // account for return value
            }
            else if (retregs == mST01)
            {   assert(stackused == 2);
                pop87();
                pop87();                // account for return value
            }
            if (bl->BC == BCexit && config.flags4 & CFG4optimized)
                mfuncreg = mfuncregsave;
            // Returning from inside try blocks: run the enclosing finally
            // handlers, innermost first
            if (MARS || usednteh & NTEH_try)
            {   block *bt;

                bt = bl;
                while ((bt = bt->Btry) != NULL)
                {   block *bf;

                    bf = list_block(list_next(bt->Bsucc));
#if MARS
                    // Only look at try-finally blocks
                    if (bf->BC == BCjcatch)
                    {
                        continue;
                    }
#endif
                    if (config.flags2 & CFG2seh)
                    {
                        if (bt->Bscope_index == 0)
                        {
                            // call __finally
                            code *cs;
                            code *cr;

                            c = cat(c,nteh_gensindex(-1));
                            gensaverestore(retregs,&cs,&cr);
                            cs = genc(cs,0xE8,0,0,0,FLblock,(targ_size_t)list_block(bf->Bsucc));
                            bl->Bcode = cat3(c,cs,cr);
                        }
                        else
                            bl->Bcode = cat(c,nteh_unwind(retregs,~0));
                        // this break leaves the while loop, not the switch:
                        // with SEH only the first such handler is processed here
                        break;
                    }
                    else
                    {
                        // call __finally
                        code *cs;
                        code *cr;
                        int nalign = 0;

                        // same save / align / CALL / restore sequence as in
                        // the BCgoto case above
                        gensaverestore(retregs,&cs,&cr);
                        if (STACKALIGN == 16)
                        {   int npush = (numbitsset(retregs) + 1) * REGSIZE;
                            if (npush & (STACKALIGN - 1))
                            {   nalign = STACKALIGN - (npush & (STACKALIGN - 1));
                                cs = genc2(cs,0x81,modregrm(3,5,SP),nalign); // SUB ESP,nalign
                                if (I64)
                                    code_orrex(cs, REX_W);
                            }
                        }
                        // CALL bf->Bsucc
                        cs = genc(cs,0xE8,0,0,0,FLblock,(targ_size_t)list_block(bf->Bsucc));
                        if (nalign)
                        {   cs = genc2(cs,0x81,modregrm(3,0,SP),nalign); // ADD ESP,nalign
                            if (I64)
                                code_orrex(cs, REX_W);
                        }
                        bl->Bcode = c = cat3(c,cs,cr);
                    }
                }
            }
            break;

#if SCPP || MARS
        case BCasm:
            assert(!e);
            // Mark destroyed registers
            assert(!c);
            c = cat(c,getregs(iasm_regs(bl)));
            if (bl->Bsucc)
            {   nextb = list_block(bl->Bsucc);
                // last block of the function: always go through L2
                if (!bl->Bnext)
                    goto L2;
                // A JMP may be needed unless the successor is the next block,
                // or the next block is an empty BCgoto leading to it
                if (nextb != bl->Bnext &&
                    bl->Bnext &&
                    !(bl->Bnext->BC == BCgoto &&
                     !bl->Bnext->Belem &&
                     nextb == list_block(bl->Bnext->Bsucc)))
                {   code *cl;

                    // See if already have JMP at end of block
                    cl = code_last(bl->Bcode);
                    if (!cl || cl->Iop != JMP)
                        goto L2;        // add JMP at end of block
                }
            }
            break;
#endif
        default:
#ifdef DEBUG
            printf("bl->BC = %d\n",bl->BC);
#endif
            assert(0);
    }
}

/*******************************
 * Generate code for blocks ending in a switch statement.
 * Take BCswitch and decide on
 *      BCifthen        use if - then code
 *      BCjmptab        index into jump table
 *      BCswitch        search table for match
 * Input:
 *      b       block to generate code for. b->Belem is the switch
 *              expression, b->BS.Bswitch points to the number of cases
 *              followed by the case values, and b->Bsucc lists the
 *              default block first, followed by one block per case.
 * Output:
 *      b->BC           changed to BCifthen or BCjmptab when that
 *                      strategy is picked, otherwise left alone
 *      b->Bcode        the generated code
 *      b->Btablesize   size of the table, set by the jump table and
 *                      switch table strategies only
 */

void doswitch(block *b)
{   code *cc,*c,*ce;
    regm_t retregs;
    unsigned ncases,n,reg,reg2,rm;
    targ_llong vmax,vmin,val;
    targ_llong *p;
    list_t bl;
    elem *e;

    tym_t tys;
    int sz;
    unsigned char dword;
    unsigned char mswsame;
#if LONGLONG
    targ_ulong msw;
#else
    unsigned msw;
#endif

    e = b->Belem;
    elem_debug(e);
    cc = docommas(&e);                  // cc is prepended to the result at the end
    cgstate.stackclean++;               // undone at the end of this function
    tys = tybasic(e->Ety);
    sz = tysize[tys];
    dword = (sz == 2 * REGSIZE);        // switch value is twice the size of a register
    mswsame = 1;                        // assume all msw's are the same
    p = b->BS.Bswitch;                  /* pointer to case data         */
    assert(p);
    ncases = *p++;                      /* number of cases              */

    vmax = MINLL;                       // smallest possible llong
    vmin = MAXLL;                       // largest possible llong
    for (n = 0; n < ncases; n++)        // find max and min case values
    {   val = *p++;
        if (val > vmax) vmax = val;
        if (val < vmin) vmin = val;
        // Track whether every case value has the same most significant word;
        // msw remembers the one from the first case.
        if (REGSIZE == 2)
        {
            unsigned short ms = (val >> 16) & 0xFFFF;
            if (n == 0)
                msw = ms;
            else if (msw != ms)
                mswsame = 0;
        }
        else // REGSIZE == 4
        {
            targ_ulong ms = (val >> 32) & 0xFFFFFFFF;
            if (n == 0)
                msw = ms;
            else if (msw != ms)
                mswsame = 0;
        }
    }
    p -= ncases;                        // back to the first case value
    //dbg_printf("vmax = x%lx, vmin = x%lx, vmax-vmin = x%lx\n",vmax,vmin,vmax - vmin);

    if (I64)
    {   // For now, just generate basic if-then sequence to get us running
        retregs = ALLREGS;
        b->BC = BCifthen;
        c = scodelem(e,&retregs,0,TRUE);
        assert(!dword);                 // 128 bit switches not supported
        reg = findreg(retregs);         // reg that result is in
        bl = b->Bsucc;
        // One compare and conditional jump per case, in case order
        for (n = 0; n < ncases; n++)
        {   code *cx;
            val = *p;
            if (sz == 4)
                cx = genc2(CNIL,0x81,modregrmx(3,7,reg),val);  // CMP reg,val
            else if (sz == 8)
            {
                if (val == (int)val)    // if val is a 64 bit value sign-extended from 32 bits
                {
                    cx = genc2(CNIL,0x81,modregrmx(3,7,reg),val);  // CMP reg,value32
                    cx->Irex |= REX_W;  // 64 bit operand
                }
                else
                {   unsigned sreg;
                                                                // MOV sreg,value64
                    cx = regwithvalue(CNIL, ALLREGS & ~mask[reg], val, &sreg, 64);
                    cx = genregs(cx,0x3B,reg,sreg);             // CMP reg,sreg
                    code_orrex(cx, REX_W);
                }
            }
            else
                assert(0);              // only 4 and 8 byte switch values are handled here
            bl = list_next(bl);         // skips the default block on the first iteration
            genjmp(cx,JE,FLblock,list_block(bl));       // JE caseaddr
            c = cat(c,cx);
            p++;
        }
        if (list_block(b->Bsucc) != b->Bnext) /* if default is not next block */
                c = cat(c,genjmp(CNIL,JMP,FLblock,list_block(b->Bsucc)));
        ce = NULL;
    }
    // Need to do research on MACHOBJ to see about better methods
    else if (MACHOBJ || ncases <= 3)
    {   // generate if-then sequence
        retregs = ALLREGS;
    L1:
        // Also entered via 'goto L1' from the switch table branch below,
        // in which case retregs has been set to mAX there.
        b->BC = BCifthen;
        c = scodelem(e,&retregs,0,TRUE);
        if (dword)
        {   reg = findreglsw(retregs);
            reg2 = findregmsw(retregs);
        }
        else
            reg = findreg(retregs);     /* reg that result is in        */
        bl = b->Bsucc;
        if (dword && mswsame)
        {   /* CMP reg2,MSW     */
            // All cases share one msw, so test it once up front
            c = genc2(c,0x81,modregrm(3,7,reg2),msw);
            genjmp(c,JNE,FLblock,list_block(b->Bsucc)); /* JNE default  */
        }
        for (n = 0; n < ncases; n++)
        {   code *cnext = CNIL;
                                        /* CMP reg,casevalue            */
            c = cat(c,ce = genc2(CNIL,0x81,modregrm(3,7,reg),(targ_int)*p));
            if (dword && !mswsame)
            {
                // The msw differs between cases: if the lsw did not match,
                // jump to the NOP placed after this case's JE; otherwise
                // compare the msw as well.
                cnext = gennop(CNIL);
                genjmp(ce,JNE,FLcode,(block *) cnext);
                genc2(ce,0x81,modregrm(3,7,reg2),MSREG(*p));
            }
            bl = list_next(bl);
                                        /* JE caseaddr                  */
            genjmp(ce,JE,FLblock,list_block(bl));
            c = cat(c,cnext);
            p++;
        }
        if (list_block(b->Bsucc) != b->Bnext) /* if default is not next block */
                c = cat(c,genjmp(CNIL,JMP,FLblock,list_block(b->Bsucc)));
        ce = NULL;
    }
#if TARGET_WINDOS               // try and find relocation to support this
    else if ((targ_ullong)(vmax - vmin) <= ncases * 2)  // then use jump table
    {   int modify;

        b->BC = BCjmptab;
        retregs = IDXREGS;
        if (dword)
            retregs |= mMSW;
        // The register is changed below by the SUB (when vmin != 0)
        // and by the SHL (when !I32)
        modify = (vmin || !I32);
        c = scodelem(e,&retregs,0,!modify);
        reg = findreg(retregs & IDXREGS); /* reg that result is in      */
        if (dword)
            reg2 = findregmsw(retregs);
        if (modify)
        {
            assert(!(retregs & regcon.mvar));
            c = cat(c,getregs(retregs));
        }
        if (vmin)                       /* if there is a minimum        */
        {
            c = genc2(c,0x81,modregrm(3,5,reg),vmin); /* SUB reg,vmin   */
            if (dword)
            {   genc2(c,0x81,modregrm(3,3,reg2),MSREG(vmin)); // SBB reg2,vmin
                genjmp(c,JNE,FLblock,list_block(b->Bsucc)); /* JNE default  */
            }
        }
        else if (dword)
        {   c = gentstreg(c,reg2);              // TEST reg2,reg2
            genjmp(c,JNE,FLblock,list_block(b->Bsucc)); /* JNE default  */
        }
        if (vmax - vmin != REGMASK)     /* if there is a maximum        */
        {                               /* CMP reg,vmax-vmin            */
            c = genc2(c,0x81,modregrm(3,7,reg),vmax-vmin);
            genjmp(c,JA,FLblock,list_block(b->Bsucc));  /* JA default   */
        }
        if (!I32)
            c = gen2(c,0xD1,modregrm(3,4,reg)); /* SHL reg,1            */
        if (I32)
        {
            ce = genc1(CNIL,0xFF,modregrm(0,4,4),FLswitch,0); /* JMP [CS:]disp[idxreg*4] */
            ce->Isib = modregrm(2,reg,5);
        }
        else
        {   rm = getaddrmode(retregs) | modregrm(0,4,0);
            ce = genc1(CNIL,0xFF,rm,FLswitch,0);        /* JMP [CS:]disp[idxreg] */
        }
        int flags = (config.flags & CFGromable) ? CFcs : 0; // table is in code seg
        ce->Iflags |= flags;                    // segment override
        ce->IEV1.Vswitch = b;
        // One near pointer for every value in vmin..vmax
        b->Btablesize = (int) (vmax - vmin + 1) * tysize[TYnptr];
    }
#endif
    else                                /* else use switch table (BCswitch) */
    {   targ_size_t disp;
        int mod;
        code *esw;
        code *ct;

        retregs = mAX;                  /* SCASW requires AX            */
        if (dword)
            retregs |= mDX;
        else if (ncases <= 6 || config.flags4 & CFG4speed)
            goto L1;                    // fall back to the if-then sequence, value in AX
        c = scodelem(e,&retregs,0,TRUE);
        if (dword && mswsame)
        {   /* CMP DX,MSW       */
            c = genc2(c,0x81,modregrm(3,7,DX),msw);
            genjmp(c,JNE,FLblock,list_block(b->Bsucc)); /* JNE default  */
        }
        ce = getregs(mCX|mDI);
#if TARGET_LINUX || TARGET_OSX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_SOLARIS
        if (config.flags3 & CFG3pic)
        {   // Add in GOT
            code *cx;
            code *cgot;

            ce = cat(ce, getregs(mDX));
            cx = genc2(NULL,CALL,0,0);  //     CALL L1
            gen1(cx, 0x58 + DI);        // L1: POP EDI

                                        //     ADD EDI,_GLOBAL_OFFSET_TABLE_+3
            symbol *gotsym = elfobj_getGOTsym();
            cgot = gencs(CNIL,0x81,modregrm(3,0,DI),FLextern,gotsym);
            cgot->Iflags = CFoff;
            cgot->IEVoffset2 = 3;

            makeitextern(gotsym);

            genmovreg(cgot, DX, DI);    // MOV EDX, EDI
                                        // ADD EDI,offset of switch table
            esw = gencs(CNIL,0x81,modregrm(3,0,DI),FLswitch,NULL);
            esw->IEV2.Vswitch = b;
            esw = cat3(cx, cgot, esw);
        }
        else
#endif
        {
                                        // MOV DI,offset of switch table
            esw = gencs(CNIL,0xC7,modregrm(3,0,DI),FLswitch,NULL);
            esw->IEV2.Vswitch = b;
        }
        ce = cat(ce,esw);
        movregconst(ce,CX,ncases,0);    /* MOV CX,ncases                */

        /* The switch table will be accessed through ES:DI.
         * Therefore, load ES with proper segment value.
         */
        if (config.flags3 & CFG3eseqds)
        {   assert(!(config.flags & CFGromable));
            ce = cat(ce,getregs(mCX));          // allocate CX
        }
        else
        {
            ce = cat(ce,getregs(mES|mCX));      // allocate ES and CX
            gen1(ce,(config.flags & CFGromable) ? 0x0E : 0x1E); // PUSH CS/DS
            gen1(ce,0x07);                      // POP  ES
        }

        disp = (ncases - 1) * intsize;          /* displacement to jump table */
        if (dword && !mswsame)
        {   code *cloop;

            /* Build the following:
                L1:     SCASW
                        JNE     L2
                        CMP     DX,[CS:]disp[DI]
                L2:     LOOPNE  L1
             */

            mod = (disp > 127) ? 2 : 1;         /* displacement size    */
            // The LOOPNE displacement is hard-coded from the sizes of the
            // instructions in the loop (one more byte with a CS: override)
            cloop = genc2(CNIL,0xE0,0,-7 - mod -
                ((config.flags & CFGromable) ? 1 : 0)); /* LOOPNE scasw */
            ce = gen1(ce,0xAF);                         /* SCASW        */
            code_orflag(ce,CFtarg2);                    // target of jump
            genjmp(ce,JNE,FLcode,(block *) cloop);      /* JNE loop     */
                                                /* CMP DX,[CS:]disp[DI] */
            ct = genc1(CNIL,0x39,modregrm(mod,DX,5),FLconst,disp);
            int flags = (config.flags & CFGromable) ? CFcs : 0; // table is in code seg
            ct->Iflags |= flags;                // possible seg override
            ce = cat3(ce,ct,cloop);
            disp += ncases * intsize;           /* skip over msw table  */
        }
        else
        {
            ce = gen1(ce,0xF2);         /* REPNE                        */
            gen1(ce,0xAF);              /* SCASW                        */
        }
        genjmp(ce,JNE,FLblock,list_block(b->Bsucc)); /* JNE default     */
        mod = (disp > 127) ? 2 : 1;     /* 1 or 2 byte displacement     */
        if (config.flags & CFGromable)
                gen1(ce,SEGCS);         /* table is in code segment     */
#if TARGET_LINUX || TARGET_OSX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_SOLARIS
        if (config.flags3 & CFG3pic)
        {                               // ADD EDX,(ncases-1)*2[EDI]
            ct = genc1(CNIL,0x03,modregrm(mod,DX,7),FLconst,disp);
                                        // JMP EDX
            gen2(ct,0xFF,modregrm(3,4,DX));
        }
        else
#endif
        {                               // JMP (ncases-1)*2[DI]
            ct = genc1(CNIL,0xFF,modregrm(mod,4,(I32 ? 7 : 5)),FLconst,disp);
            int flags = (config.flags & CFGromable) ? CFcs : 0; // table is in code seg
            ct->Iflags |= flags;
        }
        ce = cat(ce,ct);
        // Value table (plus the msw table when disp was advanced over it above)
        // followed by one near pointer per case
        b->Btablesize = disp + intsize + ncases * tysize[TYnptr];
    }
    // Result: comma code, then the switch expression and compares, then the table jump
    b->Bcode = cat3(cc,c,ce);
    //assert(b->Bcode);
    cgstate.stackclean--;
}

/******************************
 * Emit the data for a jump table (BCjmptab).
 * The table holds one code address for every value from the smallest
 * to the largest case value; values with no case of their own
 * ('holes') get the address of the default block.
 */

void outjmptab(block *b)
{
    targ_llong *cases = b->BS.Bswitch;  // case count, then the case values
    unsigned ncases = *cases++;

    // Pick the segment the table goes into and that segment's running offset
    targ_size_t *poffset;
    int jmpseg;
    if (config.flags & CFGromable)
    {   poffset = &Coffset;
        jmpseg = cseg;
    }
    else
    {   poffset = &JMPOFF;
        jmpseg = JMPSEG;
    }

    // Range of case values covered by the table
    targ_llong lo = MAXLL;              // largest possible llong
    targ_llong hi = MINLL;              // smallest possible llong
    unsigned i;
    for (i = 0; i != ncases; ++i)
    {
        targ_llong v = cases[i];
        if (v > hi)
            hi = v;
        if (v < lo)
            lo = v;
    }

    // Any alignment bytes necessary
    targ_size_t pad = align(0,*poffset) - *poffset;
    obj_lidata(jmpseg,*poffset,pad);

    targ_size_t defaddr = list_block(b->Bsucc)->Boffset;    // default address
    assert(lo <= hi);

    targ_llong slot = lo;
    while (1)
    {
        // Find the first case with this value, if there is one
        i = 0;
        while (i < ncases && cases[i] != slot)
            i++;

        targ_size_t target;
        if (i < ncases)
            target = list_block(list_nth(b->Bsucc,i + 1))->Boffset;
        else
            target = defaddr;           // hole, use the default

        reftocodseg(jmpseg,*poffset,target);
        *poffset += tysize[TYnptr];

        // Test before incrementing so that hi being the largest
        // representable value still terminates
        if (slot == hi)
            break;
        slot++;
    }
}

/******************************
 * Output data block for a switch table.
 * Two consecutive tables, the first is the case value table, the
 * second is the address table.
 * When b->Btablesize indicates it, a table of the most significant
 * words of the case values is emitted between those two.
 * The table goes into the code segment if CFGromable is set,
 * otherwise into JMPSEG.
 */

void outswitab(block *b)
{ unsigned ncases,n;
  targ_llong *p;
  targ_size_t val;
  targ_size_t alignbytes,*poffset;
  int seg;                              /* target segment for table     */
  list_t bl;
  unsigned sz;
  targ_size_t offset;

  //printf("outswitab()\n");
  p = b->BS.Bswitch;                    /* pointer to case data         */
  ncases = *p++;                        /* number of cases              */

  if (config.flags & CFGromable)
  {     poffset = &Coffset;
        assert(cseg == CODE);
        seg = cseg;
  }
  else
  {
        poffset = &JMPOFF;
        seg = JMPSEG;
  }
  // 'offset' tracks where *poffset is expected to be after each table;
  // the asserts below compare the two.
  offset = *poffset;
  alignbytes = align(0,*poffset) - *poffset;
  obj_lidata(seg,*poffset,alignbytes);  /* any alignment bytes necessary */
  assert(*poffset == offset + alignbytes);

  sz = intsize;
  for (n = 0; n < ncases; n++)          /* send out value table         */
  {
        //printf("\tcase %d, offset = x%x\n", n, *poffset);
        // For OMFOBJ the value returned by obj_bytes() is added to *poffset
        // here. NOTE(review): the other object formats presumably advance
        // *poffset inside obj_bytes(), as the assert after the loop requires
        // it to have moved - confirm.
        // Only the first sz bytes of each targ_llong case value are written.
#if OMFOBJ
        *poffset +=
#endif
            obj_bytes(seg,*poffset,sz,p);
        p++;
  }
  offset += alignbytes + sz * ncases;
  assert(*poffset == offset);

  // This size matches what doswitch() computes when it also skips over an
  // msw table (assuming intsize == REGSIZE - TODO confirm)
  if (b->Btablesize == ncases * (REGSIZE * 2 + tysize[TYnptr]))
  {
        /* Send out MSW table   */
        p -= ncases;                    // back to the first case value
        for (n = 0; n < ncases; n++)
        {   val = MSREG(*p);
            p++;
#if OMFOBJ
            *poffset +=
#endif
                obj_bytes(seg,*poffset,REGSIZE,&val);
        }
        offset += REGSIZE * ncases;
        assert(*poffset == offset);
  }

  bl = b->Bsucc;                        // first entry is skipped by the list_next() below
  for (n = 0; n < ncases; n++)          /* send out address table       */
  {     bl = list_next(bl);
        reftocodseg(seg,*poffset,list_block(bl)->Boffset);
        *poffset += tysize[TYnptr];
  }
  assert(*poffset == offset + ncases * tysize[TYnptr]);
}

/*****************************
 * Return a jump opcode relevant to the elem for a JMP TRUE.
 * Input:
 *      e       elem whose value is being tested; OPcomma nodes (and OPeq
 *              nodes whose E1 is not an operator node) are skipped over
 *              by descending into E2
 * Returns:
 *      A conditional jump opcode in the low byte (the final assert
 *      checks that it is in the range 0x70..0x7F).
 *      For some floating point cases JP or JNP is additionally placed
 *      in bits 8..15 (see XP and XNP).
 *      NOTE(review): presumably the caller emits an extra parity jump
 *      for that second opcode - confirm against the callers.
 */

int jmpopcode(elem *e)
{ tym_t tym;
  int zero,i,jp,op;
  // Indexed by [0 = signed, 1 = unsigned][compare against zero][op - OPle]
  static const char jops[][2][6] =
    {   /* <=  >   <   >=  ==  !=    <=0 >0  <0  >=0 ==0 !=0    */
       { {JLE,JG ,JL ,JGE,JE ,JNE},{JLE,JG ,JS ,JNS,JE ,JNE} }, /* signed   */
       { {JBE,JA ,JB ,JAE,JE ,JNE},{JE ,JNE,JB ,JAE,JE ,JNE} }, /* unsigned */
#if 0
       { {JLE,JG ,JL ,JGE,JE ,JNE},{JLE,JG ,JL ,JGE,JE ,JNE} }, /* real     */
       { {JBE,JA ,JB ,JAE,JE ,JNE},{JBE,JA ,JB ,JAE,JE ,JNE} }, /* 8087     */
       { {JA ,JBE,JAE,JB ,JE ,JNE},{JBE,JA ,JB ,JAE,JE ,JNE} }, /* 8087 R   */
#endif
    };

// JP / JNP carried in the second byte of the returned value
#define XP      (JP  << 8)
#define XNP     (JNP << 8)
    // Floating point jumps, indexed by [0][op - OPle]
    static const unsigned jfops[1][26] =
    /*   le     gt lt     ge  eqeq    ne     unord lg  leg  ule ul uge  */
    {
      { XNP|JBE,JA,XNP|JB,JAE,XNP|JE, XP|JNE,JP,   JNE,JNP, JBE,JC,XP|JAE,

    /*  ug    ue ngt nge nlt    nle    ord nlg nleg nule nul nuge    nug     nue */
        XP|JA,JE,JBE,JB, XP|JAE,XP|JA, JNP,JE, JP,  JA,  JNC,XNP|JB, XNP|JBE,JNE        }, /* 8087     */
    };

  assert(e);
  while (e->Eoper == OPcomma ||
        /* The !EOP(e->E1) is to line up with the case in cdeq() where  */
        /* we decide if mPSW is passed on when evaluating E2 or not.    */
         (e->Eoper == OPeq && !EOP(e->E1)))
        e = e->E2;                      /* right operand determines it  */

  op = e->Eoper;
  if (e->Ecount != e->Ecomsub)          // comsubs just get Z bit set
        return JNE;
  if (!OTrel(op))                       // not relational operator
  {
        tym_t tymx = tybasic(e->Ety);
        // These floating cases additionally get the JP in the second byte
        if (tyfloating(tymx) && config.inline8087 &&
            (tymx == TYldouble || tymx == TYildouble || tymx == TYcldouble ||
             tymx == TYcdouble || tymx == TYcfloat ||
             op == OPind))
        {
            return XP|JNE;
        }
        // Operators in the range OPbt..OPbts are tested with JC, all others with JNE
        return (op >= OPbt && op <= OPbts) ? JC : JNE;
  }

  // zero: the right operand is a constant for which boolres() is false
  if (e->E2->Eoper == OPconst)
        zero = !boolres(e->E2);
  else
        zero = 0;

  tym = e->E1->Ety;
  if (tyfloating(tym))
#if 1
  {     i = 0;                          // i is only reported by the DEBUG printf on this path
        if (config.inline8087)
        {   i = 1;

            // Decide whether the relational operator must be swapped before
            // looking it up in jfops[]
#if 1
#define NOSAHF (I64 || config.fpxmmregs)
            if (rel_exception(op) || config.flags4 & CFG4fastfloat)
            {
                if (zero)
                {
                    if (NOSAHF)
                        op = swaprel(op);
                }
                else if (NOSAHF)
                    op = swaprel(op);
                else if (cmporder87(e->E2))
                    op = swaprel(op);
                else
                    ;
            }
            else
            {
                if (zero && config.target_cpu < TARGET_80386)
                    ;
                else
                    op = swaprel(op);
            }
#else
            if (zero && !rel_exception(op) && config.target_cpu >= TARGET_80386)
                op = swaprel(op);
            else if (!zero &&
                (cmporder87(e->E2) || !(rel_exception(op) || config.flags4 & CFG4fastfloat)))
                /* compare is reversed */
                op = swaprel(op);
#endif
        }
        jp = jfops[0][op - OPle];
        goto L1;                        // skip the integer table lookup
  }
#else
        i = (config.inline8087) ? (3 + cmporder87(e->E2)) : 2;
#endif
  else if (tyuns(tym) || tyuns(e->E2->Ety))
        i = 1;                          // unsigned row of jops[]
  else if (tyintegral(tym) || typtr(tym))
        i = 0;                          // signed row of jops[]
  else
  {
#if DEBUG
        elem_print(e);
        WRTYxx(tym);
#endif
        assert(0);
  }

  jp = jops[i][zero][op - OPle];        /* table starts with OPle       */
L1:
#if DEBUG
  if ((jp & 0xF0) != 0x70)
        WROP(op),
        printf("i %d zero %d op x%x jp x%x\n",i,zero,op,jp);
#endif
  assert((jp & 0xF0) == 0x70);          // low byte must be a 0x7x conditional jump
  return jp;
}

/**********************************
 * Append code to *pc which validates pointer described by
 * addressing mode in *pcs. Modify addressing mode in *pcs.
 * The generated code pushes the address (preceded by a segment register
 * for 16 bit code) and calls the runtime function RTLSYM_PTRCHK.
 * Not used for 64 bit code. For non-16 bit code nothing is generated
 * if *pcs has a segment override.
 * Input:
 *      pc      pointer to the code list that gets appended to
 *      pcs     instruction whose addressing mode is checked; on return
 *              it is rewritten to address 0[reg]
 *      keepmsk mask of registers we must not destroy or use
 *              if (keepmsk & RMstore), this will be only a store operation
 *              into the lvalue
 */

void cod3_ptrchk(code **pc,code *pcs,regm_t keepmsk)
{   code *c;
    code *cs2;
    unsigned char rm,sib;
    unsigned reg;
    unsigned flagsave;
    unsigned opsave;
    regm_t idxregs;
    regm_t tosave;
    regm_t used;
    int i;

    assert(!I64);
    if (!I16 && pcs->Iflags & (CFes | CFss | CFcs | CFds | CFfs | CFgs))
        return;         // not designed to deal with 48 bit far pointers

    c = *pc;

    rm = pcs->Irm;
    assert(!(rm & 0x40));       // no disp8 or reg addressing modes

    // If the addressing mode is already a register
    reg = rm & 7;
    if (I16)
    {   static const unsigned char imode[8] = { BP,BP,BP,BP,SI,DI,BP,BX };

        reg = imode[reg];               // convert [SI] to SI, etc.
    }
    idxregs = mask[reg];
    // A new register is needed if there is a displacement that is not a
    // zero FLoffset, or if reg is not one of ALLREGS
    if ((rm & 0x80 && (pcs->IFL1 != FLoffset || pcs->IEV1.Vuns)) ||
        !(idxregs & ALLREGS)
       )
    {
        // Load the offset into a register, so we can push the address
        idxregs = (I16 ? IDXREGS : ALLREGS) & ~keepmsk; // only these can be index regs
        assert(idxregs);
        c = cat(c,allocreg(&idxregs,&reg,TYoffset));

        // Temporarily turn *pcs into an LEA to compute the address,
        // then restore its opcode and flags
        opsave = pcs->Iop;
        flagsave = pcs->Iflags;
        pcs->Iop = 0x8D;
        pcs->Irm |= modregrm(0,reg,0);
        pcs->Iflags &= ~(CFopsize | CFss | CFes | CFcs);        // no prefix bytes needed
        c = gen(c,pcs);                 // LEA reg,EA

        pcs->Iflags = flagsave;
        pcs->Iop = opsave;
    }

    // registers destroyed by the function call
    //used = (mBP | ALLREGS | mES) & ~fregsaved;
    used = 0;                           // much less code generated this way

    // cs2 collects the POPs, in reverse order of the PUSHes.
    // With used == 0, tosave is 0 and this loop generates nothing.
    cs2 = CNIL;
    tosave = used & (keepmsk | idxregs);
    for (i = 0; tosave; i++)
    {   regm_t mi = mask[i];

        assert(i < REGMAX);
        if (mi & tosave)        /* i = register to save                 */
        {
            int push,pop;

            stackchanged = 1;
            if (i == ES)
            {   push = 0x06;
                pop = 0x07;
            }
            else
            {   push = 0x50 + i;
                pop = push | 8;
            }
            c = gen1(c,push);                   // PUSH i
            cs2 = cat(gen1(CNIL,pop),cs2);      // POP i
            tosave &= ~mi;
        }
    }

    // For 16 bit models, push a far pointer
    if (I16)
    {   int segreg;

        // segreg is the opcode of the PUSH for the segment register in use
        switch (pcs->Iflags & (CFes | CFss | CFcs | CFds | CFfs | CFgs))
        {   case CFes:  segreg = 0x06;  break;
            case CFss:  segreg = 0x16;  break;
            case CFcs:  segreg = 0x0E;  break;
            case 0:     segreg = 0x1E;  break;  // DS
            default:
                assert(0);
        }

        // See if we should default to SS:
        // (Happens when BP is part of the addressing mode)
        if (segreg == 0x1E && (rm & 0xC0) != 0xC0 &&
            rm & 2 && (rm & 7) != 7)
        {   segreg = 0x16;
            if (config.wflags & WFssneds)
                pcs->Iflags |= CFss;    // because BP won't be there anymore
        }
        c = gen1(c,segreg);             // PUSH segreg
    }

    c = gen1(c,0x50 + reg);             // PUSH reg

    // Rewrite the addressing mode in *pcs so it is just 0[reg]
    setaddrmode(pcs, idxregs);
    pcs->IFL1 = FLoffset;
    pcs->IEV1.Vuns = 0;

    // Call the validation function
    {
        makeitextern(rtlsym[RTLSYM_PTRCHK]);

        used &= ~(keepmsk | idxregs);           // regs destroyed by this exercise
        c = cat(c,getregs(used));
                                                // CALL __ptrchk
        gencs(c,(LARGECODE) ? 0x9A : CALL,0,FLfunc,rtlsym[RTLSYM_PTRCHK]);
    }

    *pc = cat(c,cs2);                   // the POPs (if any) go after the call
}


/***********************************
 * Decide whether BP is free to serve as a general purpose register
 * for the current function (funcsym_p).
 * Note parallels between this routine and prolog().
 * As a side effect, when none of the early checks rule BP out,
 * stackoffsets(0) is called and localsize is set to an estimate.
 * Returns:
 *      0       can't be used, needed for frame
 *      mBP     can be used
 */

regm_t cod3_useBP()
{
    // Note that DOSX memory model cannot use EBP as a general purpose
    // register, as SS != DS.
    if (!(config.exe & EX_flat) || config.flags & (CFGalwaysframe | CFGnoebp))
        return 0;

    // Functions containing inline assembler keep the frame
    if (anyiasm)
        return 0;

    tym_t tyf = funcsym_p->ty();
    if (tyf & mTYnaked)                 // if no prolog/epilog for function
        return 0;

    if (funcsym_p->Sfunc->Fflags3 & Ffakeeh)
        return 0;                       // need consistent stack frame

    tym_t tym = tybasic(tyf);
    if (tym == TYifunc)
        return 0;

    stackoffsets(0);
    localsize = Aoffset;                // an estimate only

    // Each of the following on its own rules out using BP
    if (!(config.flags4 & CFG4speed))
        return 0;
    if (config.target_cpu < TARGET_Pentium)
        return 0;
    if (tyfarfunc(tym))
        return 0;
    if (config.flags & CFGstack)
        return 0;
    if (localsize >= 0x100)             // arbitrary value < 0x1000
        return 0;
    if (usednteh & ~NTEHjmonitor)
        return 0;
    if (usedalloca)
        return 0;

    return mBP;
}

/***************************************
 * Gen code for OPframeptr.
 * Allocates a register from *pretregs (or from allregs if *pretregs
 * names none of them), emits an ESCAPE | ESCframeptr pseudo
 * instruction carrying that register in Irm, then fixes up the result.
 */

code *cdframeptr(elem *e, regm_t *pretregs)
{
    // Registers the caller will accept, falling back to any of allregs
    regm_t wanted = *pretregs & allregs;
    regm_t retregs = wanted ? wanted : allregs;

    unsigned reg;
    code *cg = allocreg(&retregs, &reg, TYint);

    code esc;
    esc.Iop = ESCAPE | ESCframeptr;
    esc.Iflags = 0;
    esc.Irex = 0;
    esc.Irm = reg;
    cg = gen(cg,&esc);

    code *cfix = fixresult(e,retregs,pretregs);
    return cat(cg,cfix);
}

/***************************************
 * Gen code for load of _GLOBAL_OFFSET_TABLE_.
 * This value gets cached in the local variable 'localgot'.
 * Only implemented for the OSX and ELF targets; asserts otherwise.
 */

code *cdgot(elem *e, regm_t *pretregs)
{
#if TARGET_OSX
    unsigned reg;
    regm_t retregs = *pretregs & allregs;
    if (retregs == 0)
        retregs = allregs;

    code *c = allocreg(&retregs, &reg, TYnptr);
    c = genc(c,CALL,0,0,0,FLgot,0);     //     CALL L1
    gen1(c, 0x58 + reg);                // L1: POP reg

    code *cfix = fixresult(e,retregs,pretregs);
    return cat(c,cfix);
#elif TARGET_LINUX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_SOLARIS
    unsigned reg;
    regm_t retregs = *pretregs & allregs;
    if (retregs == 0)
        retregs = allregs;

    code *c = allocreg(&retregs, &reg, TYnptr);
    c = genc2(c,CALL,0,0);              //     CALL L1
    gen1(c, 0x58 + reg);                // L1: POP reg

    //     ADD reg,_GLOBAL_OFFSET_TABLE_+3
    symbol *gotsym = elfobj_getGOTsym();
    code *cgot = gencs(CNIL,0x81,modregrm(3,0,reg),FLextern,gotsym);

    /* The offset from L1: is hard-coded (2 when reg is AX, otherwise 3),
     * so nothing may be placed between these instructions.
     * CFvolatile keeps the scheduler from rearranging them.
     */
    cgot->Iflags = CFoff | CFvolatile;
    if (reg == AX)
        cgot->IEVoffset2 = 2;
    else
        cgot->IEVoffset2 = 3;

    makeitextern(gotsym);

    code *cfix = fixresult(e,retregs,pretregs);
    return cat3(c,cgot,cfix);
#else
    assert(0);
    return NULL;
#endif
}

/**************************************************
 * Load contents of localgot into EBX.
 * Code is only generated for 32 bit PIC code on the ELF targets.
 * If there is no localgot symbol, an OPgot elem is evaluated into
 * EBX instead.
 * Returns:
 *      the generated code, NULL if none
 */

code *load_localgot()
{
#if TARGET_LINUX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_SOLARIS
    if (config.flags3 & CFG3pic && I32)
    {
        elem *e;
        if (localgot)
        {
            // because this hack doesn't work with reg allocator
            localgot->Sflags &= ~GTregcand;
            e = el_var(localgot);
        }
        else
        {
            e = el_long(TYnptr, 0);
            e->Eoper = OPgot;
        }

        regm_t retregs = mBX;
        code *c = codelem(e,&retregs,FALSE);
        el_free(e);
        return c;
    }
#endif
    return NULL;
}

#if TARGET_LINUX || TARGET_OSX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_SOLARIS
/*****************************
 * Store name at p as a length prefixed string, without the
 * terminating 0.
 *      up to 255 chars:        1 length byte, then the chars
 *      longer:                 0xFF, 0, a 16 bit length in the byte
 *                              order of the host, then the chars
 * Only the low 16 bits of the length are stored for long names.
 * The buffer at p must have room for strlen(name) + ONS_OHD bytes.
 * Returns:
 *      # of bytes stored
 */

#define ONS_OHD 4               // max # of extra bytes added by obj_namestring()

STATIC int obj_namestring(char *p,const char *name)
{   unsigned len;

    len = strlen(name);
    if (len > 255)
    {
        // p is a char buffer with no alignment guarantee, so the 16 bit
        // length is copied in with memcpy() rather than stored through
        // a (short *) cast, which could be a misaligned access and
        // violates the aliasing rules.
        unsigned short len16 = (unsigned short)len;

        p[0] = 0xFF;
        p[1] = 0;
        memcpy(p + 2,&len16,sizeof(len16));
        memcpy(p + 4,name,len);
        len += ONS_OHD;
    }
    else
    {   p[0] = len;
        memcpy(p + 1,name,len);
        len++;
    }
    return len;
}
#endif

/* Append a register to register instruction: opcode op with a
 * mod=3 modregrm byte built from dstreg and srcreg.
 */
code *genregs(code *c,unsigned op,unsigned dstreg,unsigned srcreg)
{
    unsigned modrm = modregxrmx(3,dstreg,srcreg);
    return gen2(c,op,modrm);
}

/* Append TEST t,t to c and flag it with CFpsw. */
code *gentstreg(code *c,unsigned t)
{
    code *result = gen2(c,0x85,modregxrmx(3,t,t));      // TEST t,t
    code_orflag(result,CFpsw);
    return result;
}

/* Append PUSH reg to c; registers 8 and up additionally get REX_B. */
code *genpush(code *c, unsigned reg)
{
    unsigned lowbits = reg & 7;                 // register number within the opcode
    code *result = gen1(c, 0x50 + lowbits);
    if ((reg & 8) != 0)
        code_orrex(result, REX_B);
    return result;
}

/* Append POP reg to c; registers 8 and up additionally get REX_B. */
code *genpop(code *c, unsigned reg)
{
    unsigned lowbits = reg & 7;                 // register number within the opcode
    code *result = gen1(c, 0x58 + lowbits);
    if ((reg & 8) != 0)
        code_orrex(result, REX_B);
    return result;
}

/**************************
 * Generate a MOV that stores a register into the stack slot at
 * offset 'slot' (an FLcs operand, BP relative).
 * If reg is ES it is changed to 0, the number used in the
 * segment register form of the MOV.
 */
code *gensavereg(unsigned& reg, targ_uns slot)
{
    // MOV i[BP],reg
    unsigned op;
    if (reg == ES)
    {
        reg = 0;                // the real reg number
        op = 0x8C;              // segment reg mov
    }
    else
        op = 0x89;              // normal mov

    code *c = genc1(NULL,op,modregxrm(2, reg, BPRM),FLcs,slot);
    if (I64)
        code_orrex(c, REX_W);
    return c;
}

/**************************
 * Append a register to register MOV to c.
 * Nothing is generated when 'to' and 'from' are the same register.
 * Moves to or from ES use the segment register forms of MOV.
 */

code *genmovreg(code *c,unsigned to,unsigned from)
{
#if DEBUG
    if (to > ES || from > ES)
        printf("genmovreg(c = %p, to = %d, from = %d)\n",c,to,from);
#endif
    assert(to <= ES && from <= ES);

    if (to == from)
        return c;               // redundant move

    // Choose the opcode and the two register fields
    unsigned op;
    unsigned regfield;
    unsigned rmfield;
    if (to == ES)
    {   op = 0x8E;
        regfield = 0;
        rmfield = from;
    }
    else if (from == ES)
    {   op = 0x8C;
        regfield = 0;
        rmfield = to;
    }
    else
    {   op = 0x89;
        regfield = from;
        rmfield = to;
    }

    c = genregs(c,op,regfield,rmfield);
    if (I64)
        code_orrex(c, REX_W);
    return c;
}

/***************************************
 * Generate code for r1 = r2 * imm.
 * A multiply by 1 becomes a register move and a multiply by 5
 * becomes an LEA; everything else is an IMUL with an immediate.
 */

code *genmulimm(code *c,unsigned r1,unsigned r2,targ_int imm)
{
    // These optimizations should probably be put into pinholeopt()
    if (imm == 1)
        return genmovreg(c,r1,r2);

    if (imm == 5)
    {
        // LEA r1,[r2][r2*4]
        code lea;
        lea.Iop = LEA;
        lea.Iflags = 0;
        lea.Irex = 0;
        buildEA(&lea,r2,r2,4,0);
        lea.orReg(r1);
        return gen(c,&lea);
    }

    return genc2(c,0x69,modregxrmx(3,r1,r2),imm);       // IMUL r1,r2,imm
}

/******************************
 * Append code that loads CX with the value of _AHSHIFT.
 * Only available when built with SCPP && TX86; asserts otherwise.
 */

code *genshift(code *c)
{
#if SCPP && TX86
    // Set up ahshift to trick ourselves into giving the right fixup,
    // which must be seg-relative, external frame, external target.
    code *mov = gencs(CNIL,0xC7,modregrm(3,0,CX),FLfunc,rtlsym[RTLSYM_AHSHIFT]);
    mov->Iflags |= CFoff;
    return cat(c,mov);
#else
    assert(0);
    return NULL;
#endif
}

/******************************
 * Move constant value into reg.
 * Take advantage of existing values in registers: the known register
 * contents recorded in regcon.immed are consulted to find a cheaper
 * way to produce the value, and the new contents of reg are recorded
 * with regimmed_set().
 * If flags & mPSW
 *      set flags based on result
 * Else if flags & 8
 *      do not disturb flags
 * Else
 *      don't care about flags
 * If flags & 1 then byte move
 * If flags & 2 then short move (for I32 and I64)
 * If flags & 4 then don't disturb unused portion of register
 * If flags & 16 then reg is a byte register AL..BH
 * If flags & 64 (0x40) then 64 bit move (I64 only)
 * Params:
 *      c       code list to append to (may be CNIL)
 *      reg     destination register
 *      value   constant to load
 *      flags   bit mask described above
 * Returns:
 *      code (if any) generated
 */

code *movregconst(code *c,unsigned reg,targ_size_t value,regm_t flags)
{   unsigned r;
    regm_t mreg;

    //printf("movregconst(reg=%s, value= %lld (%llx), flags=%x)\n", regm_str(mask[reg]), value, value, flags);
#define genclrreg(a,r) genregs(a,0x31,r,r)

    // regm is non-zero if regcon.immed has a value recorded for reg,
    // and regv is that recorded value
    regm_t regm = regcon.immed.mval & mask[reg];
    targ_size_t regv = regcon.immed.value[reg];

    if (flags & 1)      // 8 bits
    {
        value &= 0xFF;
        regm &= BYTEREGS;

        // If we already have the right value in the right register
        if (regm && (regv & 0xFF) == value)
            goto L2;

        // For an H register, look at bits 8..15 of the recorded value of
        // the corresponding full register (reg & 3). Note regv is switched
        // over to that register's value for the bookkeeping at L2.
        if (flags & 16 && reg & 4 &&    // if an H byte register
            regcon.immed.mval & mask[reg & 3] &&
            (((regv = regcon.immed.value[reg & 3]) >> 8) & 0xFF) == value)
            goto L2;

        /* Avoid byte register loads on Pentium Pro and Pentium II
         * to avoid dependency stalls.
         */
        if (config.flags4 & CFG4speed &&
            config.target_cpu >= TARGET_PentiumPro && !(flags & 4))
            goto L3;

        // See if another register has the right value
        r = 0;
        for (mreg = (regcon.immed.mval & BYTEREGS); mreg; mreg >>= 1)
        {
            if (mreg & 1)
            {
                if ((regcon.immed.value[r] & 0xFF) == value)
                {   c = genregs(c,0x8A,reg,r);          // MOV regL,rL
                    if (I64 && reg >= 4 || r >= 4)
                        code_orrex(c, REX);
                    goto L2;
                }
                if (!(I64 && reg >= 4) &&
                    r < 4 && ((regcon.immed.value[r] >> 8) & 0xFF) == value)
                {   c = genregs(c,0x8A,reg,r | 4);      // MOV regL,rH
                    goto L2;
                }
            }
            r++;
        }

        if (value == 0 && !(flags & 8))
        {
            if (!(flags & 4) &&                 // if we can set the whole register
                !(flags & 16 && reg & 4))       // and reg is not an H register
            {   c = genregs(c,0x31,reg,reg);    // XOR reg,reg
                regimmed_set(reg,value);
                regv = 0;
            }
            else
                c = genregs(c,0x30,reg,reg);    // XOR regL,regL
            flags &= ~mPSW;                     // flags already set by XOR
        }
        else
        {   c = genc2(c,0xC6,modregrmx(3,0,reg),value);  /* MOV regL,value */
            if (reg >= 4 && I64)
            {
                code_orrex(c, REX);
            }
        }
    L2:
        // Keep the returned list head: c can still be CNIL here when the
        // value was already in the register, and dropping the result
        // would lose the TEST instruction.
        if (flags & mPSW)
            c = genregs(c,0x84,reg,reg);        // TEST regL,regL

        if (regm)
            // Set just the 'L' part of the register value
            regimmed_set(reg,(regv & ~(targ_size_t)0xFF) | value);
        else if (flags & 16 && reg & 4 && regcon.immed.mval & mask[reg & 3])
            // Set just the 'H' part of the register value
            regimmed_set((reg & 3),(regv & ~(targ_size_t)0xFF00) | (value << 8));
        return c;
    }
L3:
    // Word/dword/qword load (also reached from the byte case when a
    // full register load is preferred for speed)
    if (I16)
        value = (targ_short) value;             /* sign-extend MSW      */
    else if (I32)
        value = (targ_int) value;

    if (!I16 && flags & 2)                      // load 16 bit value
    {
        value &= 0xFFFF;
        // A zero value, or a request to set the flags, is handled by the
        // full register path at L1
        if (value == 0)
            goto L1;
        else
        {
            if (flags & mPSW)
                goto L1;
            code *c1 = genc2(CNIL,0xC7,modregrmx(3,0,reg),value); // MOV reg,value
            c1->Iflags |= CFopsize;             // yes, even for I64
            c = cat(c,c1);
            if (regm)
                // High bits of register are not affected by 16 bit load
                regimmed_set(reg,(regv & ~(targ_size_t)0xFFFF) | value);
        }
        return c;
    }
L1:

    /* If we already have the right value in the right register */
    if (regm && (regv & 0xFFFFFFFF) == (value & 0xFFFFFFFF) && !(flags & 64))
    {   if (flags & mPSW)
            c = gentstreg(c,reg);
    }
    else if (flags & 64 && regm && regv == value)
    {   // Look at the full 64 bits
        if (flags & mPSW)
        {
            c = gentstreg(c,reg);
            code_orrex(c, REX_W);
        }
    }
    else
    {
        if (flags & mPSW)
        {
            // The flags must reflect the result, so only use sequences
            // whose last instruction sets them
            switch (value)
            {   case 0:
                    c = genclrreg(c,reg);
                    if (flags & 64)
                        code_orrex(c, REX_W);
                    break;
                case 1:
                    if (I64)
                        goto L4;
                    c = genclrreg(c,reg);
                    goto inc;
                case -1:
                    if (I64)
                        goto L4;
                    c = genclrreg(c,reg);
                    goto dec;
                default:
                L4:
                    if (flags & 64)
                    {
                        c = genc2(c,0xC7,(REX_W << 16) | modregrmx(3,0,reg),value); // MOV reg,value64
                        gentstreg(c,reg);
                        code_orrex(c, REX_W);
                    }
                    else
                    {   c = genc2(c,0xC7,modregrmx(3,0,reg),value); /* MOV reg,value */
                        gentstreg(c,reg);
                    }
                    break;
            }
        }
        else
        {
            /* Look for single byte conversion  */
            // i.e. value is the sign extension of what is recorded for AX
            if (regcon.immed.mval & mAX)
            {
                if (I32)
                {   if (reg == AX && value == (targ_short) regv)
                    {   c = gen1(c,0x98);               /* CWDE         */
                        goto done;
                    }
                    if (reg == DX &&
                        value == (regcon.immed.value[AX] & 0x80000000 ? 0xFFFFFFFF : 0) &&
                        !(config.flags4 & CFG4speed && config.target_cpu >= TARGET_Pentium)
                       )
                    {   c = gen1(c,0x99);               /* CDQ          */
                        goto done;
                    }
                }
                else if (I16)
                {
                    if (reg == AX &&
                        (targ_short) value == (signed char) regv)
                    {   c = gen1(c,0x98);               /* CBW          */
                        goto done;
                    }

                    if (reg == DX &&
                        (targ_short) value == (regcon.immed.value[AX] & 0x8000 ? (targ_short) 0xFFFF : (targ_short) 0) &&
                        !(config.flags4 & CFG4speed && config.target_cpu >= TARGET_Pentium)
                       )
                    {   c = gen1(c,0x99);               /* CWD          */
                        goto done;
                    }
                }
            }
            if (value == 0 && !(flags & 8) && config.target_cpu >= TARGET_80486)
            {   c = genclrreg(c,reg);           // CLR reg
                if (flags & 64)
                    code_orrex(c, REX_W);
                goto done;
            }

            // If reg holds value-1 or value+1, a single INC/DEC will do.
            // The inc/dec labels are also targets of the mPSW switch above.
            if (!I64 && regm && !(flags & 8))
            {   if (regv + 1 == value ||
                    /* Catch case of (0xFFFF+1 == 0) for 16 bit compiles */
                    (I16 && (targ_short)(regv + 1) == (targ_short)value))
                {
                inc:
                    c = gen1(c,0x40 + reg);     /* INC reg              */
                    goto done;
                }
                if (regv - 1 == value)
                {
                dec:
                    c = gen1(c,0x48 + reg);     /* DEC reg              */
                    goto done;
                }
            }

            /* See if another register has the right value      */
            r = 0;
            for (mreg = regcon.immed.mval; mreg; mreg >>= 1)
            {
#ifdef DEBUG
                assert(!I16 || regcon.immed.value[r] == (targ_short)regcon.immed.value[r]);
#endif
                if (mreg & 1 && regcon.immed.value[r] == value)
                {   c = genmovreg(c,reg,r);
                    if (flags & 64)
                        code_orrex(c, REX_W);
                    goto done;
                }
                r++;
            }

            if (value == 0 && !(flags & 8))
            {   c = genclrreg(c,reg);           // CLR reg
                if (flags & 64)
                    code_orrex(c, REX_W);
            }
            else
            {   /* See if we can just load a byte       */
                if (regm & BYTEREGS &&
                    !(config.flags4 & CFG4speed && config.target_cpu >= TARGET_PentiumPro)
                   )
                {
                    // Only the low byte differs from what reg already holds.
                    // The recursive call does its own regimmed_set(), hence
                    // the direct return.
                    if ((regv & ~(targ_size_t)0xFF) == (value & ~(targ_size_t)0xFF))
                    {   c = movregconst(c,reg,value,(flags & 8) |4|1);  // load regL
                        return c;
                    }
                    // Only bits 8..15 differ: load the H register instead
                    if (regm & (mAX|mBX|mCX|mDX) &&
                        (regv & ~(targ_size_t)0xFF00) == (value & ~(targ_size_t)0xFF00) &&
                        !I64)
                    {   c = movregconst(c,4|reg,value >> 8,(flags & 8) |4|1|16); // load regH
                        return c;
                    }
                }
                if (flags & 64)
                    c = genc2(c,0xC7,(REX_W << 16) | modregrmx(3,0,reg),value); // MOV reg,value64
                else
                    c = genc2(c,0xC7,modregrmx(3,0,reg),value); // MOV reg,value
            }
        }
    done:
        // Record that reg now holds value
        regimmed_set(reg,value);
    }
    return c;
}

/**************************
 * Generate a jump instruction.
 * Params:
 *      c       code list to append to
 *      op      low byte is the jump opcode; bits 8..15 optionally hold a
 *              second jump opcode (JP or JNP, from floating point
 *              comparisons), or 0/1 for none
 *      fltarg  FLblock (or FLcode)
 *      targ    target block (or a code * cast to block * when fltarg
 *              is FLcode)
 * Returns:
 *      c with the jump sequence appended
 */

code *genjmp(code *c,unsigned op,unsigned fltarg,block *targ)
{
    code jcs;
    const unsigned op2 = op & 0xFF00;   // second jump opcode, if any

    jcs.Iop = op & 0xFF;
    // Anything that is not already a long branch (JMP or 0xE8) is assumed
    // to need the long form, as for op = 0x7x
    jcs.Iflags = (op != JMP && op != 0xE8) ? CFjmp16 : 0;
    jcs.Irex = 0;
    jcs.IFL2 = fltarg;                  // FLblock (or FLcode)
    jcs.IEV2.Vblock = targ;             // target block (or code)

    // A code target must be marked as a jump destination
    if (fltarg == FLcode)
        ((code *)targ)->Iflags |= CFtarg;

    // With fast floating point the second opcode is never examined
    if (config.flags4 & CFG4fastfloat)
        return gen(c,&jcs);

    code *cjump = gen(CNIL,&jcs);

    /* The JP and JNP come from floating point comparisons */
    if (op2 == (JP << 8))
    {
        // Follow the jump with a JP to the same target
        jcs.Iop = JP;
        gen(cjump,&jcs);
    }
    else if (op2 == (JNP << 8))
    {
        /* Do a JP around the jump instruction */
        code *cskip = gennop(CNIL);
        c = genjmp(c,JP,FLcode,(block *) cskip);
        cat(cjump,cskip);
    }
    else if (op2 != (1 << 8) &&         // toggled no jump
             op2 != (0 << 8))
    {
#ifdef DEBUG
        printf("jop = x%x\n",op);
#endif
        assert(0);
    }

    return cat(c,cjump);
}

/*******************************
 * Generate code for a function start.
 * Input:
 *      Coffset         address of start of code
 * Output:
 *      Coffset         adjusted for size of code generated
 *      EBPtoESP
 *      hasframe
 *      BPoff
 */

code *prolog()
{
    SYMIDX si;
    unsigned reg;
    char enter;
    unsigned Foffset;
    unsigned xlocalsize;     // amount to subtract from ESP to make room for locals
    unsigned pushallocreg;
    char guessneedframe;
    regm_t namedargs = 0;

    //printf("cod3.prolog(), needframe = %d, Aalign = %d\n", needframe, Aalign);
    debugx(debugw && printf("funcstart()\n"));
    regcon.immed.mval = 0;                      /* no values in registers yet   */
    EBPtoESP = -REGSIZE;
    hasframe = 0;
    char pushds = 0;
    BPoff = 0;
    code *c = CNIL;
    int pushalloc = 0;
    tym_t tyf = funcsym_p->ty();
    tym_t tym = tybasic(tyf);
    unsigned farfunc = tyfarfunc(tym);
    pushallocreg = (tyf == TYmfunc) ? CX : AX;
    if (config.flags & CFGalwaysframe || funcsym_p->Sfunc->Fflags3 & Ffakeeh)
        needframe = 1;

Lagain:
    guessneedframe = needframe;
//    if (needframe && config.exe & (EX_LINUX | EX_FREEBSD | EX_SOLARIS) && !(usednteh & ~NTEHjmonitor))
//      usednteh |= NTEHpassthru;

    /* Compute BP offsets for variables on stack.
     * The organization is:
     *  Poff    parameters
     *          seg of return addr      (if far function)
     *          IP of return addr
     *  BP->    caller's BP
     *          DS                      (if Windows prolog/epilog)
     *          exception handling context symbol
     *  Aoff    autos and regs
     *  regsave.off  any saved registers
     *  Foff    floating register
     *  AAoff   alloca temporary
     *  CSoff   common subs
     *  NDPoff  any 8087 saved registers
     *  Toff    temporaries
     *          monitor context record
     *          any saved registers
     */

    if (tym == TYifunc)
        Poff = 26;
    else if (I64)
        Poff = 16;
    else if (I32)
        Poff = farfunc ? 12 : 8;
    else
        Poff = farfunc ? 6 : 4;

    Aoff = 0;
#if NTEXCEPTIONS == 2
    Aoff -= nteh_contextsym_size();
#if MARS
    if (funcsym_p->Sfunc->Fflags3 & Ffakeeh && nteh_contextsym_size() == 0)
        Aoff -= 5 * 4;
#endif
#endif
    Aoff = -align(0,-Aoff + Aoffset);

    regsave.off = Aoff - align(0,regsave.top);
    Foffset = floatreg ? (config.fpxmmregs ? 16 : DOUBLESIZE) : 0;
    Foff = regsave.off - align(0,Foffset);
    assert(usedalloca != 1);
    AAoff = usedalloca ? (Foff - REGSIZE) : Foff;
    CSoff = AAoff - align(0,cstop * REGSIZE);
    NDPoff = CSoff - align(0,NDP::savetop * NDPSAVESIZE);
    Toff = NDPoff - align(0,Toffset);

    if (Foffset > Aalign)
        Aalign = Foffset;
    if (Aalign > REGSIZE)
    {
        // Adjust Aoff so that it is Aalign byte aligned, assuming that
        // before function parameters were pushed the stack was
        // Aalign byte aligned
        targ_size_t psize = (Poffset + (REGSIZE - 1)) & ~(REGSIZE - 1);
        int sz = psize + -Aoff + Poff + (needframe ? 0 : REGSIZE);
        if (sz & (Aalign - 1))
        {   int adj = Aalign - (sz & (Aalign - 1));
            Aoff -= adj;
            regsave.off -= adj;
            Foff -= adj;
            AAoff -= adj;
            CSoff -= adj;
            NDPoff -= adj;
            Toff -= adj;
        }
    }

    localsize = -Toff;

    regm_t topush = fregsaved & ~mfuncreg;     // mask of registers that need saving
    int npush = 0;                      // number of registers that need saving
    for (regm_t x = topush; x; x >>= 1)
        npush += x & 1;

    // Keep the stack aligned by 8 for any subsequent function calls
    if (!I16 && calledafunc &&
        (STACKALIGN == 16 || config.flags4 & CFG4stackalign))
    {
        //printf("npush = %d Poff = x%x needframe = %d localsize = x%x\n", npush, Poff, needframe, localsize);

        int sz = Poff + (needframe ? 0 : -REGSIZE) + localsize + npush * REGSIZE;
        if (STACKALIGN == 16)
        {
            if (sz & (8|4))
                localsize += STACKALIGN - (sz & (8|4));
        }
        else if (sz & 4)
            localsize += 4;
    }

    //printf("Foff x%02x Aoff x%02x Toff x%02x NDPoff x%02x CSoff x%02x Poff x%02x localsize x%02x\n",
        //(int)Foff,(int)Aoff,(int)Toff,(int)NDPoff,(int)CSoff,(int)Poff,(int)localsize);

    xlocalsize = localsize;

    if (tyf & mTYnaked)                 // if no prolog/epilog for function
    {
        hasframe = 1;
        return NULL;
    }

    if (tym == TYifunc)
    {   static unsigned char ops2[] = { 0x60,0x1E,0x06,0 };
        static unsigned char ops0[] = { 0x50,0x51,0x52,0x53,
                                        0x54,0x55,0x56,0x57,
                                        0x1E,0x06,0 };

        unsigned char *p;

        p = (config.target_cpu >= TARGET_80286) ? ops2 : ops0;
        do
            c = gen1(c,*p);
        while (*++p);
        c = genregs(c,0x8B,BP,SP);                              // MOV BP,SP
        if (localsize)
            c = genc2(c,0x81,modregrm(3,5,SP),localsize);       // SUB SP,localsize
        tyf |= mTYloadds;
        hasframe = 1;
        goto Lcont;
    }

    /* Determine if we need BP set up   */
    if (config.flags & CFGalwaysframe)
        needframe = 1;
    else
    {
        if (localsize)
        {
            if (I16 ||
                !(config.flags4 & CFG4speed) ||
                config.target_cpu < TARGET_Pentium ||
                farfunc ||
                config.flags & CFGstack ||
                xlocalsize >= 0x1000 ||
                (usednteh & ~NTEHjmonitor) ||
                anyiasm ||
                usedalloca
               )
                needframe = 1;
        }
        if (refparam && (anyiasm || I16))
            needframe = 1;
    }

    if (needframe)
    {   assert(mfuncreg & mBP);         // shouldn't have used mBP

        if (!guessneedframe)            // if guessed wrong
            goto Lagain;
    }

    if (I16 && config.wflags & WFwindows && farfunc)
    {   int wflags;
        int segreg;

#if SCPP
        // alloca() can't be because the 'special' parameter won't be at
        // a known offset from BP.
        if (usedalloca == 1)
            synerr(EM_alloca_win);      // alloca() can't be in Windows functions
#endif

        wflags = config.wflags;
        if (wflags & WFreduced && !(tyf & mTYexport))
        {   // reduced prolog/epilog for non-exported functions
            wflags &= ~(WFdgroup | WFds | WFss);
        }

        c = getregs(mAX);
        assert(!c);                     /* should not have any value in AX */

        switch (wflags & (WFdgroup | WFds | WFss))
        {   case WFdgroup:                      // MOV  AX,DGROUP
                if (wflags & WFreduced)
                    tyf &= ~mTYloadds;          // remove redundancy
                c = genc(c,0xC7,modregrm(3,0,AX),0,0,FLdatseg,(targ_uns) 0);
                c->Iflags ^= CFseg | CFoff;     // turn off CFoff, on CFseg
                break;
            case WFss:
                segreg = 2;                     // SS
                goto Lmovax;
            case WFds:
                segreg = 3;                     // DS
            Lmovax:
                c = gen2(c,0x8C,modregrm(3,segreg,AX)); // MOV AX,segreg
                if (wflags & WFds)
                    gen1(c,0x90);               // NOP
                break;
            case 0:
                break;
            default:
#ifdef DEBUG
                printf("config.wflags = x%x\n",config.wflags);
#endif
                assert(0);
        }
        if (wflags & WFincbp)
            c = gen1(c,0x40 + BP);              // INC  BP
        c = gen1(c,0x50 + BP);                  // PUSH BP
        genregs(c,0x8B,BP,SP);                  // MOV  BP,SP
        if (wflags & (WFsaveds | WFds | WFss | WFdgroup))
        {   gen1(c,0x1E);                       // PUSH DS
            pushds = TRUE;
            BPoff = -REGSIZE;
        }
        if (wflags & (WFds | WFss | WFdgroup))
            gen2(c,0x8E,modregrm(3,3,AX));      // MOV  DS,AX

        enter = FALSE;                  /* don't use ENTER instruction  */
        hasframe = 1;                   /* we have a stack frame        */
    }
    else
    if (needframe)                      // if variables or parameters
    {
        if (config.wflags & WFincbp && farfunc)
            c = gen1(c,0x40 + BP);      /* INC  BP                      */
        if (config.target_cpu < TARGET_80286 ||
            config.exe & (EX_LINUX | EX_LINUX64 | EX_OSX | EX_OSX64 | EX_FREEBSD | EX_FREEBSD64 | EX_SOLARIS | EX_SOLARIS64) ||
            !localsize ||
            config.flags & CFGstack ||
            (xlocalsize >= 0x1000 && config.exe & EX_flat) ||
            localsize >= 0x10000 ||
#if NTEXCEPTIONS == 2
            (usednteh & ~NTEHjmonitor && (config.flags2 & CFG2seh)) ||
#endif
            (config.target_cpu >= TARGET_80386 &&
             config.flags4 & CFG4speed)
           )
        {
            c = gen1(c,0x50 + BP);      // PUSH BP
            genregs(c,0x8B,BP,SP);      // MOV  BP,SP
            if (I64)
                code_orrex(c, REX_W);   // MOV RBP,RSP
#if ELFOBJ || MACHOBJ
            if (config.fulltypes)
                // Don't reorder instructions, as dwarf CFA relies on it
                code_orflag(c, CFvolatile);
#endif
            enter = FALSE;              /* do not use ENTER instruction */
#if NTEXCEPTIONS == 2
            if (usednteh & ~NTEHjmonitor && (config.flags2 & CFG2seh))
            {
                code *ce = nteh_prolog();
                c = cat(c,ce);
                int sz = nteh_contextsym_size();
                assert(sz != 0);        // should be 5*4, not 0
                xlocalsize -= sz;       // sz is already subtracted from ESP
                                        // by nteh_prolog()
            }
#endif
#if ELFOBJ || MACHOBJ
            if (config.fulltypes)
            {   int off = I64 ? 16 : 8;
                dwarf_CFA_set_loc(1);           // address after PUSH EBP
                dwarf_CFA_set_reg_offset(SP, off); // CFA is now 8[ESP]
                dwarf_CFA_offset(BP, -off);       // EBP is at 0[ESP]
                dwarf_CFA_set_loc(3);           // address after MOV EBP,ESP
                // Yes, I know the parameter is 8 when we mean 0!
                // But this gets the cfa register set to EBP correctly
                dwarf_CFA_set_reg_offset(BP, off);        // CFA is now 0[EBP]
            }
#endif
        }
        else
            enter = TRUE;
        hasframe = 1;
    }

    if (config.flags & CFGstack)        /* if stack overflow check      */
        goto Ladjstack;

    if (needframe)                      /* if variables or parameters   */
    {
        if (xlocalsize)                 /* if any stack offset          */
        {
        Ladjstack:
#if !TARGET_LINUX               // seems that Linux doesn't need to fault in stack pages
            if ((config.flags & CFGstack && !(I32 && xlocalsize < 0x1000)) // if stack overflow check
#if TARGET_WINDOS
                || (xlocalsize >= 0x1000 && config.exe & EX_flat)
#endif
               )
            {
                if (I16)
                {
                    // BUG: Won't work if parameter is passed in AX
                    c = movregconst(c,AX,xlocalsize,FALSE); // MOV AX,localsize
                    makeitextern(rtlsym[RTLSYM_CHKSTK]);
                                                            // CALL _chkstk
                    gencs(c,(LARGECODE) ? 0x9A : CALL,0,FLfunc,rtlsym[RTLSYM_CHKSTK]);
                    useregs((ALLREGS | mBP | mES) & ~rtlsym[RTLSYM_CHKSTK]->Sregsaved);
                }
                else
                {
                    /* Watch out for 64 bit code where EDX is passed as a register parameter
                     */
                    int reg = I64 ? R11 : DX;  // scratch register

                    /*      MOV     EDX, xlocalsize/0x1000
                     *  L1: SUB     ESP, 0x1000
                     *      TEST    [ESP],ESP
                     *      DEC     EDX
                     *      JNE     L1
                     *      SUB     ESP, xlocalsize % 0x1000
                     */
                    c = movregconst(c, reg, xlocalsize / 0x1000, FALSE);
                    code *csub = genc2(NULL,0x81,modregrm(3,5,SP),0x1000);
                    if (I64)
                        code_orrex(csub, REX_W);
                    code_orflag(csub, CFtarg2);
                    gen2sib(csub, 0x85, modregrm(0,SP,4),modregrm(0,4,SP));
                    if (I64)
                    {   gen2(csub, 0xFF, (REX_W << 16) | modregrmx(3,0,R11));   // DEC R11
                        genc2(csub,JNE,0,(targ_uns)-14);
                    }
                    else
                    {   gen1(csub, 0x48 + DX);                  // DEC EDX
                        genc2(csub,JNE,0,(targ_uns)-12);
                    }
                    regimmed_set(reg,0);             // reg is now 0
                    genc2(csub,0x81,modregrm(3,5,SP),xlocalsize & 0xFFF);
                    if (I64)
                        code_orrex(csub, REX_W);
                    c = cat(c,csub);
                    useregs(mask[reg]);
                }
            }
            else
#endif
            {
                if (enter)
                {   // ENTER xlocalsize,0
                    c = genc(c,0xC8,0,FLconst,xlocalsize,FLconst,(targ_uns) 0);
#if ELFOBJ || MACHOBJ
                    assert(!config.fulltypes);          // didn't emit Dwarf data
#endif
                }
                else if (xlocalsize == REGSIZE && config.flags4 & CFG4optimized)
                {   c = gen1(c,0x50 + pushallocreg);    // PUSH AX
                    // Do this to prevent an -x[EBP] to be moved in
                    // front of the push.
                    code_orflag(c,CFvolatile);
                    pushalloc = 1;
                }
                else
                {   // SUB SP,xlocalsize
                    c = genc2(c,0x81,modregrm(3,5,SP),xlocalsize);
                    if (I64)
                        code_orrex(c, REX_W);
                }
            }

            if (usedalloca)
            {
                // Set up magic parameter for alloca()
                // MOV -REGSIZE[BP],localsize - BPoff
                //c = genc(c,0xC7,modregrm(2,0,BPRM),FLconst,-REGSIZE,FLconst,localsize - BPoff);
                c = genc(c,0xC7,modregrm(2,0,BPRM),
                        FLconst,AAoff + BPoff,
                        FLconst,localsize - BPoff);
                if (I64)
                    code_orrex(c, REX_W);
            }
        }
        else
            assert(usedalloca == 0);
    }
    else if (xlocalsize)
    {
        assert(I32);

        if (xlocalsize == REGSIZE)
        {   c = gen1(c,0x50 + pushallocreg);    // PUSH AX
            pushalloc = 1;
        }
        else if (xlocalsize == 2 * REGSIZE)
        {   c = gen1(c,0x50 + pushallocreg);    // PUSH AX
            gen1(c,0x50 + pushallocreg);        // PUSH AX
            pushalloc = 1;
        }
        else
        {   // SUB ESP,xlocalsize
            c = genc2(c,0x81,modregrm(3,5,SP),xlocalsize);
            if (I64)
                code_orrex(c, REX_W);
        }
        BPoff += REGSIZE;
    }
    else
        assert((localsize | usedalloca) == 0 || (usednteh & NTEHjmonitor));
    EBPtoESP += xlocalsize;

    /*  The idea is to generate trace for all functions if -Nc is not thrown.
     *  If -Nc is thrown, generate trace only for global COMDATs, because those
     *  are relevant to the FUNCTIONS statement in the linker .DEF file.
     *  This same logic should be in epilog().
     */
    if (config.flags & CFGtrace &&
        (!(config.flags4 & CFG4allcomdat) ||
         funcsym_p->Sclass == SCcomdat ||
         funcsym_p->Sclass == SCglobal ||
         (config.flags2 & CFG2comdat && SymInline(funcsym_p))
        )
       )
    {
        if (STACKALIGN == 16 && npush)
        {   /* This could be avoided by moving the function call to after the
             * registers are saved. But I don't remember why the call is here
             * and not there.
             */
            c = genc2(c,0x81,modregrm(3,5,SP),npush * REGSIZE); // SUB ESP,npush * REGSIZE
            if (I64)
                code_orrex(c, REX_W);
        }

        symbol *s = rtlsym[farfunc ? RTLSYM_TRACE_PRO_F : RTLSYM_TRACE_PRO_N];
        makeitextern(s);
        c = gencs(c,I16 ? 0x9A : CALL,0,FLfunc,s);      // CALL _trace
        if (!I16)
            code_orflag(c,CFoff | CFselfrel);
        /* Embedding the function name inline after the call works, but it
         * makes disassembling the code annoying.
         */
#if ELFOBJ || MACHOBJ
        size_t len = strlen(funcsym_p->Sident);
        char *buffer = (char *)malloc(len + 4);
        assert(buffer);
        if (len <= 254)
        {   buffer[0] = len;
            memcpy(buffer + 1, funcsym_p->Sident, len);
            len++;
        }
        else
        {   buffer[0] = 0xFF;
            buffer[1] = 0;
            buffer[2] = len & 0xFF;
            buffer[3] = len >> 8;
            memcpy(buffer + 4, funcsym_p->Sident, len);
            len += 4;
        }
        genasm(c, buffer, len);         // append func name
        free(buffer);
#else
        char name[IDMAX+IDOHD+1];
        size_t len = obj_mangle(funcsym_p,name);
        assert(len < sizeof(name));
        genasm(c,name,len);                             // append func name
#endif
        if (STACKALIGN == 16 && npush)
        {
            c = genc2(c,0x81,modregrm(3,0,SP),npush * REGSIZE); // ADD ESP,npush * REGSIZE
            if (I64)
                code_orrex(c, REX_W);
        }
        useregs((ALLREGS | mBP | mES) & ~s->Sregsaved);
    }

#if MARS
    if (usednteh & NTEHjmonitor)
    {   Symbol *sthis;

        for (si = 0; 1; si++)
        {   assert(si < globsym.top);
            sthis = globsym.tab[si];
            if (strcmp(sthis->Sident,"this") == 0)
                break;
        }
        c = cat(c,nteh_monitor_prolog(sthis));
        EBPtoESP += 3 * 4;
    }
#endif

    while (topush)                      /* while registers to push      */
    {   reg = findreg(topush);
        topush &= ~mask[reg];
        c = gen1(c,0x50 + (reg & 7));
        if (reg & 8)
            code_orrex(c, REX_B);
        EBPtoESP += REGSIZE;
#if ELFOBJ || MACHOBJ
        if (config.fulltypes)
        {   // Emit debug_frame data giving location of saved register
            // relative to 0[EBP]
            pinholeopt(c, NULL);
            dwarf_CFA_set_loc(calcblksize(c));  // address after PUSH reg
            dwarf_CFA_offset(reg, -EBPtoESP - REGSIZE);
        }
#endif
    }

Lcont:

    /* Determine if we need to reload DS        */
    if (tyf & mTYloadds)
    {   code *c1;

        if (!pushds)                            // if not already pushed
            c = gen1(c,0x1E);                   // PUSH DS
        c1 = genc(CNIL,0xC7,modregrm(3,0,AX),0,0,FLdatseg,(targ_uns) 0); /* MOV  AX,DGROUP      */
        c1->Iflags ^= CFseg | CFoff;            /* turn off CFoff, on CFseg */
        c = cat(c,c1);
        gen2(c,0x8E,modregrm(3,3,AX));            /* MOV  DS,AX         */
        useregs(mAX);
    }

    if (tym == TYifunc)
        c = gen1(c,0xFC);                       // CLD

#if NTEXCEPTIONS == 2
    if (usednteh & NTEH_except)
        c = cat(c,nteh_setsp(0x89));            // MOV __context[EBP].esp,ESP
#endif

    // Load register parameters off of the stack. Do not use
    // assignaddr(), as it will replace the stack reference with
    // the register!
    for (si = 0; si < globsym.top; si++)
    {   symbol *s = globsym.tab[si];
        code *c2;
        unsigned sz = type_size(s->Stype);

        if ((s->Sclass == SCregpar || s->Sclass == SCparameter) &&
            s->Sfl == FLreg &&
            (refparam
#if MARS
                // This variable has been reference by a nested function
                || s->Stype->Tty & mTYvolatile
#endif
                ))
        {
            /* MOV reg,param[BP]        */
            //assert(refparam);
            if (mask[s->Sreglsw] & XMMREGS)
            {
                unsigned op = xmmload(s->Stype->Tty);  // MOVSS/D xreg,mem
                unsigned xreg = s->Sreglsw - XMM0;
                code *c2 = genc1(CNIL,op,modregxrm(2,xreg,BPRM),FLconst,Poff + s->Soffset);
                if (!hasframe)
                {   // Convert to ESP relative address rather than EBP
                    c2->Irm = modregxrm(2,xreg,4);
                    c2->Isib = modregrm(0,4,SP);
                    c2->IEVpointer1 += EBPtoESP;
                }
                c = cat(c,c2);
            }
            else
            {
                code *c2 = genc1(CNIL,0x8B ^ (sz == 1),
                    modregxrm(2,s->Sreglsw,BPRM),FLconst,Poff + s->Soffset);
                if (!I16 && sz == SHORTSIZE)
                    c2->Iflags |= CFopsize; // operand size
                if (I64 && sz >= REGSIZE)
                    c2->Irex |= REX_W;
                if (!hasframe)
                {   /* Convert to ESP relative address rather than EBP      */
                    assert(!I16);
                    c2->Irm = modregxrm(2,s->Sreglsw,4);
                    c2->Isib = modregrm(0,4,SP);
                    c2->IEVpointer1 += EBPtoESP;
                }
                if (sz > REGSIZE)
                {
                    code *c3 = genc1(CNIL,0x8B,
                        modregxrm(2,s->Sregmsw,BPRM),FLconst,Poff + s->Soffset + REGSIZE);
                    if (I64)
                        c3->Irex |= REX_W;
                    if (!hasframe)
                    {   /* Convert to ESP relative address rather than EBP  */
                        assert(!I16);
                        c3->Irm = modregxrm(2,s->Sregmsw,4);
                        c3->Isib = modregrm(0,4,SP);
                        c3->IEVpointer1 += EBPtoESP;
                    }
                    c2 = cat(c2,c3);
                }
                c = cat(c,c2);
            }
        }
        else if (s->Sclass == SCfastpar)
        {   // Argument is passed in a register
            unsigned preg = s->Spreg;

            namedargs |= mask[preg];

            if (s->Sfl == FLreg)
            {   // MOV reg,preg
                if (mask[preg] & XMMREGS)
                {
                    unsigned op = xmmload(s->Stype->Tty);      // MOVSS/D xreg,preg
                    unsigned xreg = s->Sreglsw - XMM0;
                    c = gen2(c,op,modregxrmx(3,xreg,preg - XMM0));
                }
                else
                {
                    c = genmovreg(c,s->Sreglsw,preg);
                    if (I64 && sz == 8)
                        code_orrex(c, REX_W);
                }
            }
            else if (s->Sflags & SFLdead ||
                (!anyiasm && !(s->Sflags & SFLread) && s->Sflags & SFLunambig &&
#if MARS
                 // This variable has been reference by a nested function
                 !(s->Stype->Tty & mTYvolatile) &&
#endif
                 (config.flags4 & CFG4optimized || !config.fulltypes)))
            {
                // Ignore it, as it is never referenced
                ;
            }
            else
            {
                targ_size_t offset = Aoff + BPoff + s->Soffset;
                int op = 0x89;                  // MOV x[EBP],preg
                if (preg >= XMM0 && preg <= XMM15)
                {
                    op = xmmstore(s->Stype->Tty);
                }
                if (hasframe)
                {
                    if (!(pushalloc && preg == pushallocreg))
                    {
                        // MOV x[EBP],preg
                        c2 = genc1(CNIL,op,
                            modregxrm(2,preg,BPRM),FLconst, offset);
                        if (preg >= XMM0 && preg <= XMM15)
                        {
                        }
                        else
                        {
//printf("%s Aoff = %d, BPoff = %d, Soffset = %d, sz = %d\n", s->Sident, (int)Aoff, (int)BPoff, (int)s->Soffset, (int)sz);
//                          if (offset & 2)
//                              c2->Iflags |= CFopsize;
                            if (I64 && sz == 8)
                                code_orrex(c2, REX_W);
                        }
                        c = cat(c, c2);
                    }
                }
                else
                {
                    offset += EBPtoESP;
                    if (!(pushalloc && preg == pushallocreg))
                    {
                        // MOV offset[ESP],preg
                        // BUG: byte size?
                        c2 = genc1(CNIL,op,
                            (modregrm(0,4,SP) << 8) |
                            modregxrm(2,preg,4),FLconst,offset);
                        if (preg >= XMM0 && preg <= XMM15)
                        {
                        }
                        else
                        {
                            if (I64 && sz == 8)
                                c2->Irex |= REX_W;
//                          if (offset & 2)
//                              c2->Iflags |= CFopsize;
                        }
                        c = cat(c,c2);
                    }
                }
            }
        }
    }

    /* Load arguments passed in registers into the varargs save area
     * so they can be accessed by va_arg().
     */
    if (I64 && variadic(funcsym_p->Stype))
    {
        /* Look for __va_argsave
         */
        symbol *sv = NULL;
        for (SYMIDX si = 0; si < globsym.top; si++)
        {   symbol *s = globsym.tab[si];
            if (s->Sident[0] == '_' && strcmp(s->Sident, "__va_argsave") == 0)
            {   sv = s;
                break;
            }
        }

        if (sv && !(sv->Sflags & SFLdead))
        {
        /* Generate code to move any arguments passed in registers into
         * the stack variable __va_argsave,
         * so we can reference it via pointers through va_arg().
         *   struct __va_argsave_t {
         *     size_t[6] regs;
         *     real[8] fpregs;
         *     uint offset_regs;
         *     uint offset_fpregs;
         *     void* stack_args;
         *     void* reg_args;
         *   }
         * The MOVAPS instructions seg fault if data is not aligned on
         * 16 bytes, so this gives us a nice check to ensure no mistakes.
            MOV     voff+0*8[RBP],EDI
            MOV     voff+1*8[RBP],ESI
            MOV     voff+2*8[RBP],RDX
            MOV     voff+3*8[RBP],RCX
            MOV     voff+4*8[RBP],R8
            MOV     voff+5*8[RBP],R9
            MOVZX   EAX,AL                      // AL = 0..8, # of XMM registers used
            SHL     EAX,2                       // 4 bytes for each MOVAPS
            LEA     RDX,offset L2[RIP]
            SUB     RDX,RAX
            LEA     RAX,voff+6*8+0x7F[RBP]
            JMP     EDX
            MOVAPS  -0x0F[RAX],XMM7             // only save XMM registers if actually used
            MOVAPS  -0x1F[RAX],XMM6
            MOVAPS  -0x2F[RAX],XMM5
            MOVAPS  -0x3F[RAX],XMM4
            MOVAPS  -0x4F[RAX],XMM3
            MOVAPS  -0x5F[RAX],XMM2
            MOVAPS  -0x6F[RAX],XMM1
            MOVAPS  -0x7F[RAX],XMM0
          L2:
            MOV     1[RAX],offset_regs          // set __va_argsave.offset_regs
            MOV     5[RAX],offset_fpregs        // set __va_argsave.offset_fpregs
            LEA     RDX, Poff+Poffset[RBP]
            MOV     9[RAX],RDX                  // set __va_argsave.stack_args
            SUB     RAX,6*8+0x7F                // point to start of __va_argsave
            MOV     6*8+8*16+4+4+8[RAX],RAX     // set __va_argsave.reg_args
        */
        targ_size_t voff = Aoff + BPoff + sv->Soffset;  // EBP offset of start of sv
        const int vregnum = 6;
        const unsigned vsize = vregnum * 8 + 8 * 16;
        code *cv = CNIL;

        static unsigned char regs[vregnum] = { DI,SI,DX,CX,R8,R9 };

        if (!hasframe)
            voff += EBPtoESP;
        for (int i = 0; i < vregnum; i++)
        {
            unsigned r = regs[i];
            if (!(mask[r] & namedargs))         // named args are already dealt with
            {   unsigned ea = (REX_W << 16) | modregxrm(2,r,BPRM);
                if (!hasframe)
                    ea = (REX_W << 16) | (modregrm(0,4,SP) << 8) | modregxrm(2,r,4);
                cv = genc1(cv,0x89,ea,FLconst,voff + i*8);
            }
        }

        cv = genregs(cv,0x0FB6,AX,AX);                          // MOVZX EAX,AL
        genc2(cv,0xC1,modregrm(3,4,AX),2);                      // SHL EAX,2
        int raxoff = voff+6*8+0x7F;
        unsigned L2offset = (raxoff < -0x7F) ? 0x2C : 0x29;
        if (!hasframe)
            L2offset += 1;                                      // +1 for sib byte
        // LEA RDX,offset L2[RIP]
        genc1(cv,0x8D,(REX_W << 16) | modregrm(0,DX,5),FLconst,L2offset);
        genregs(cv,0x29,AX,DX);                                 // SUB RDX,RAX
        code_orrex(cv, REX_W);
        // LEA RAX,voff+vsize-6*8-16+0x7F[RBP]
        unsigned ea = (REX_W << 16) | modregrm(2,AX,BPRM);
        if (!hasframe)
            // add sib byte for [RSP] addressing
            ea = (REX_W << 16) | (modregrm(0,4,SP) << 8) | modregxrm(2,AX,4);
        genc1(cv,0x8D,ea,FLconst,raxoff);
        gen2(cv,0xFF,modregrm(3,4,DX));                         // JMP EDX
        for (int i = 0; i < 8; i++)
        {
            // MOVAPS -15-16*i[RAX],XMM7-i
            genc1(cv,0x0F29,modregrm(0,XMM7-i,0),FLconst,-15-16*i);
        }

        /* Compute offset_regs and offset_fpregs
         */
        unsigned offset_regs = 0;
        unsigned offset_fpregs = vregnum * 8;
        for (int i = AX; i <= XMM7; i++)
        {   regm_t m = mask[i];
            if (m & namedargs)
            {
                if (m & (mDI|mSI|mDX|mCX|mR8|mR9))
                    offset_regs += 8;
                else if (m & XMMREGS)
                    offset_fpregs += 16;
                namedargs &= ~m;
                if (!namedargs)
                    break;
            }
        }
        // MOV 1[RAX],offset_regs
        genc(cv,0xC7,modregrm(2,0,AX),FLconst,1,FLconst,offset_regs);

        // MOV 5[RAX],offset_fpregs
        genc(cv,0xC7,modregrm(2,0,AX),FLconst,5,FLconst,offset_fpregs);

        // LEA RDX, Poff+Poffset[RBP]
        ea = modregrm(2,DX,BPRM);
        if (!hasframe)
            ea = (modregrm(0,4,SP) << 8) | modregrm(2,DX,4);
        Poffset = (Poffset + (REGSIZE - 1)) & ~(REGSIZE - 1);
        genc1(cv,0x8D,(REX_W << 16) | ea,FLconst,Poff + Poffset);

        // MOV 9[RAX],RDX
        genc1(cv,0x89,(REX_W << 16) | modregrm(2,DX,AX),FLconst,9);

        // SUB RAX,6*8+0x7F             // point to start of __va_argsave
        genc2(cv,0x2D,0,6*8+0x7F);
        code_orrex(cv, REX_W);

        // MOV 6*8+8*16+4+4+8[RAX],RAX  // set __va_argsave.reg_args
        genc1(cv,0x89,(REX_W << 16) | modregrm(2,AX,AX),FLconst,6*8+8*16+4+4+8);

        pinholeopt(cv, NULL);
        useregs(mDX|mAX);
        c = cat(c,cv);
        }
    }

#if 0 && TARGET_LINUX
    if (gotref)
    {                                   // position independent reference
        c = cat(c, cod3_load_got());
    }
#endif

    return c;
}

/*******************************
 * Generate the function epilog and append it to the block's code (b->Bcode).
 * Output:
 *      retsize         Size of function epilog
 */

// Number of bytes popped off the stack so far by the epilog being generated
// (POP DS and the saved-register POPs). epilog() zeroes it before emitting
// those pops; cod3_spoff() adds it to localsize.
static targ_size_t spoff;

// Build the epilog code for block b and append it to b->Bcode.
//  - If the block does not have BFLepilog set, only the RET is generated.
//  - If the function is mTYnaked, nothing is generated at all.
// The byte size of the generated code is added to retsize.
void epilog(block *b)
{   code *c;
    code *cr;
    code *ce;
    code *cpopds;
    unsigned reg;
    unsigned regx;                      // register that's not a return reg
    regm_t topop,regm;
    tym_t tyf,tym;
    int op;
    char farfunc;
    targ_size_t xlocalsize = localsize;

    c = CNIL;                           // c collects the epilog instructions
    ce = b->Bcode;                      // ce is the code already in the block
    tyf = funcsym_p->ty();
    tym = tybasic(tyf);
    farfunc = tyfarfunc(tym);
    if (!(b->Bflags & BFLepilog))       // if no epilog code
        goto Lret;                      // just generate RET
    // Scratch register used below to discard a REGSIZE-sized local area
    // with a single POP.
    regx = (b->BC == BCret) ? AX : CX;

    spoff = 0;
    retsize = 0;

    if (tyf & mTYnaked)                 // if no prolog/epilog
        return;

    if (tym == TYifunc)
    {   // Opcode sequences, each terminated by 0. Both end in 0xCF (IRET).
        // ops2: POP ES, POP DS, POPA, IRET
        static unsigned char ops2[] = { 0x07,0x1F,0x61,0xCF,0 };
        // ops0: POP ES, POP DS, then individual POPs of
        // DI,SI,BP,BX,BX,DX,CX,AX, IRET.
        // NOTE(review): the first POP BX presumably discards the saved SP
        // slot, mirroring what POPA does - confirm against the prolog.
        static unsigned char ops0[] = { 0x07,0x1F,0x5F,0x5E,
                                        0x5D,0x5B,0x5B,0x5A,
                                        0x59,0x58,0xCF,0 };
        unsigned char *p;

        c = genregs(c,0x8B,SP,BP);              // MOV SP,BP
        // POPA form is only selected for 80286 and later targets
        p = (config.target_cpu >= TARGET_80286) ? ops2 : ops0;
        do
            gen1(c,*p);
        while (*++p);
        goto Lopt;
    }

    // When tracing is enabled, call the trace epilog runtime routine
    // (far or near variant depending on the function type).
    if (config.flags & CFGtrace &&
        (!(config.flags4 & CFG4allcomdat) ||
         funcsym_p->Sclass == SCcomdat ||
         funcsym_p->Sclass == SCglobal ||
         (config.flags2 & CFG2comdat && SymInline(funcsym_p))
        )
       )
    {
        symbol *s = rtlsym[farfunc ? RTLSYM_TRACE_EPI_F : RTLSYM_TRACE_EPI_N];
        makeitextern(s);
        c = gencs(c,I16 ? 0x9A : CALL,0,FLfunc,s);      // CALLF _trace
        if (!I16)
            code_orflag(c,CFoff | CFselfrel);
        // Every register the trace routine does not preserve counts as used
        useregs((ALLREGS | mBP | mES) & ~s->Sregsaved);
    }

    if (usednteh & ~NTEHjmonitor && (config.exe == EX_NT || MARS))
        c = cat(c,nteh_epilog());

    // Remember the POP DS instruction so the Windows far-function path
    // below can turn it into a NOP if it emits its own POP DS.
    cpopds = CNIL;
    if (tyf & mTYloadds)
    {   cpopds = gen1(cpopds,0x1F);             // POP DS
        c = cat(c,cpopds);
        spoff += intsize;
    }

    /* Pop all the general purpose registers saved on the stack
     * by the prolog code. Remember to do them in the reverse
     * order they were pushed.
     */
    reg = I64 ? R15 : DI;               // start at the highest numbered register
    regm = 1 << reg;
    topop = fregsaved & ~mfuncreg;
#ifdef DEBUG
    if (topop & ~0xFFFF)
        printf("fregsaved = x%x, mfuncreg = x%x\n",fregsaved,mfuncreg);
#endif
    assert(!(topop & ~0xFFFF));         // only the low 16 registers may be popped
    while (topop)
    {   if (topop & regm)
        {   c = gen1(c,0x58 + (reg & 7));         // POP reg
            if (reg & 8)
                code_orrex(c, REX_B);   // registers 8..15 need the REX.B prefix bit
            topop &= ~regm;
            spoff += REGSIZE;
        }
        regm >>= 1;
        reg--;
    }

#if MARS
    if (usednteh & NTEHjmonitor)
    {
        regm_t retregs = 0;
        if (b->BC == BCretexp)
            retregs = regmask(b->Belem->Ety, tym);
        code *cn = nteh_monitor_epilog(retregs);
        c = cat(c,cn);
        // NOTE(review): presumably accounts for extra stack space that
        // the monitor prolog allocated - confirm against nteh_monitor_prolog()
        xlocalsize += 8;
    }
#endif

    if (config.wflags & WFwindows && farfunc)
    {
        int wflags = config.wflags;
        if (wflags & WFreduced && !(tyf & mTYexport))
        {   // reduced prolog/epilog for non-exported functions
            wflags &= ~(WFdgroup | WFds | WFss);
            if (!(wflags & WFsaveds))
                goto L4;                // use the ordinary frame teardown below
        }

        if (localsize | usedalloca)
        {
            c = genc1(c,0x8D,modregrm(1,SP,6),FLconst,(targ_uns)-2); /* LEA SP,-2[BP] */
        }
        if (wflags & (WFsaveds | WFds | WFss | WFdgroup))
        {   if (cpopds)
                cpopds->Iop = NOP;              // don't need previous one
            c = gen1(c,0x1F);                   // POP DS
        }
        c = gen1(c,0x58 + BP);                  // POP BP
        if (config.wflags & WFincbp)
            gen1(c,0x48 + BP);                  // DEC BP
        assert(hasframe);
    }
    else
    {
        if (needframe || (xlocalsize && hasframe))
        {
        L4:     // also entered by goto from the Windows reduced-epilog case above
            assert(hasframe);
            if (xlocalsize | usedalloca)
            {   // LEAVE is used for 80286 and later, except when targeting
                // 80386 or later with CFG4speed set.
                if (config.target_cpu >= TARGET_80286 &&
                    !(config.target_cpu >= TARGET_80386 &&
                     config.flags4 & CFG4speed)
                   )
                    c = gen1(c,0xC9);           // LEAVE
                else if (0 && xlocalsize == REGSIZE && !usedalloca && I32)
                {   // This doesn't work - I should figure out why
                    // (branch is disabled by the constant 0 in the condition)
                    mfuncreg &= ~mask[regx];
                    c = gen1(c,0x58 + regx);    // POP regx
                    c = gen1(c,0x58 + BP);      // POP BP
                }
                else
                {   c = genregs(c,0x8B,SP,BP);  // MOV SP,BP
                    if (I64)
                        code_orrex(c, REX_W);   // MOV RSP,RBP
                    c = gen1(c,0x58 + BP);      // POP BP
                }
            }
            else
                c = gen1(c,0x58 + BP);          // POP BP
            if (config.wflags & WFincbp && farfunc)
                gen1(c,0x48 + BP);              // DEC BP
        }
        else if (xlocalsize == REGSIZE && (!I16 || b->BC == BCret))
        {   // No frame: drop a single register-sized local with one POP
            // into the scratch register, which is then marked as used.
            mfuncreg &= ~mask[regx];
            c = gen1(c,0x58 + regx);                    // POP regx
        }
        else if (xlocalsize)
        {
            c = genc2(c,0x81,modregrm(3,0,SP),xlocalsize);      // ADD SP,xlocalsize
            if (I64)
                code_orrex(c, REX_W);
        }
    }
    if (b->BC == BCret || b->BC == BCretexp)
    {
Lret:   // also entered directly when the block has no BFLepilog flag
        // 0xCA / 0xC2 are far / near "RET imm16"; op+1 (0xCB / 0xC3) is
        // the corresponding RET without an immediate.
        op = tyfarfunc(tym) ? 0xCA : 0xC2;
        if (tym == TYhfunc)
        {
            c = genc2(c,0xC2,0,4);                      // RET 4
        }
        else if (!typfunc(tym) ||                       // if caller cleans the stack
                 Poffset == 0)                          // or nothing pushed on the stack anyway
        {   op++;                                       // to a regular RET
            c = gen1(c,op);
        }
        else
        {   // Stack is always aligned on register size boundary
            Poffset = (Poffset + (REGSIZE - 1)) & ~(REGSIZE - 1);
            c = genc2(c,op,0,Poffset);          // RET Poffset
        }
    }

Lopt:
    // If last instruction in ce is ADD SP,imm, and first instruction
    // in c sets SP, we can dump the ADD.
    // (This peephole is skipped entirely when generating 64 bit code.)
    cr = code_last(ce);
    if (cr && c && !I64)
    {
        if (cr->Iop == 0x81 && cr->Irm == modregrm(3,0,SP))     // if ADD SP,imm
        {
            if (
                c->Iop == 0xC9 ||                                  // LEAVE
                (c->Iop == 0x8B && c->Irm == modregrm(3,SP,BP)) || // MOV SP,BP
                (c->Iop == 0x8D && c->Irm == modregrm(1,SP,6))     // LEA SP,-imm[BP]
               )
                cr->Iop = NOP;
            else if (c->Iop == 0x58 + BP)                       // if POP BP
            {   // Rewrite the ADD into MOV SP,BP so the POP BP that
                // follows reads from the frame slot.
                cr->Iop = 0x8B;
                cr->Irm = modregrm(3,SP,BP);                    // MOV SP,BP
            }
        }
#if 0   // These optimizations don't work if the called function
        // cleans off the stack.
        else if (c->Iop == 0xC3 && cr->Iop == CALL)     // CALL near
        {   cr->Iop = 0xE9;                             // JMP near
            c->Iop = NOP;
        }
        else if (c->Iop == 0xCB && cr->Iop == 0x9A)     // CALL far
        {   cr->Iop = 0xEA;                             // JMP far
            c->Iop = NOP;
        }
#endif
    }

    retsize += calcblksize(c);          // compute size of function epilog
    b->Bcode = cat(ce,c);               // epilog goes after the block's existing code
}

/*******************************
 * Return offset of SP from BP.
 */

targ_size_t cod3_spoff()
{
    // Start from the size of the locals, then add the bytes that
    // epilog() has recorded in spoff as popped.
    targ_size_t total = localsize;
    total += spoff;
    return total;
}

/**********************************
 * Load value of _GLOBAL_OFFSET_TABLE_ into EBX
 */

code *cod3_load_got()
{
#if TARGET_LINUX || TARGET_OSX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_SOLARIS
    // Get the current instruction address into EBX:
    //      CALL L1         (displacement 0, so it "calls" the next instruction)
    //  L1: POP  EBX        (pops the return address pushed by the CALL)
    code *cgetpc = genc2(NULL,CALL,0,0);
    gen1(cgetpc, 0x58 + BX);

    // Then turn it into the GOT address:
    //      ADD EBX,_GLOBAL_OFFSET_TABLE_+3
    // NOTE(review): the 3 presumably covers the bytes between L1 and the
    // relocated immediate (1 byte POP + 2 bytes opcode/modrm) - confirm.
    symbol *sgot = elfobj_getGOTsym();
    code *caddgot = gencs(CNIL,0x81,0xC3,FLextern,sgot);
    caddgot->IEVoffset2 = 3;
    caddgot->Iflags = CFoff;

    makeitextern(sgot);
    return cat(cgetpc,caddgot);
#else
    // Not available for other targets
    assert(0);
    return NULL;
#endif
}

/* Generate the instruction(s) that move symbol s between its assigned
 * register(s) and its memory location.
 *      toreg   true:  load the register(s) from memory
 *              false: store the register(s) to memory
 * Returns the generated code list.
 */
code* gen_spill_reg(Symbol* s, bool toreg)
{
    const regm_t keep = toreg ? RMload : RMstore;
    const int size = type_size(s->Stype);
    code cs;
    code *cresult;

    // Wrap the symbol in a variable elem so getlvalue() can be used to
    // fill in the memory operand of cs.
    elem *evar = el_var(s);

    if (!(mask[s->Sreglsw] & XMMREGS))
    {
        // General purpose register:
        //   MOV reg,mem (0x8B) when loading, MOV mem,reg (0x89) when storing;
        //   flipping the low opcode bit selects the byte-sized form.
        unsigned op = toreg ? 0x8B : 0x89;
        if (size == 1)
            op ^= 1;
        cs.Iop = op;
        cresult = getlvalue(&cs,evar,keep);
        cs.orReg(s->Sreglsw);
        // 64 bit byte access to register numbers 4 and up gets a REX prefix
        if (I64 && size == 1 && s->Sreglsw >= 4)
            cs.Irex |= REX;
        cresult = gen(cresult,&cs);

        if (size > REGSIZE)
        {
            // Value spans two registers: repeat for the most significant
            // word, with the address adjusted by getlvalue_msw().
            cs.setReg(s->Sregmsw);
            getlvalue_msw(&cs);
            cresult = gen(cresult,&cs);
        }
    }
    else
    {
        // XMM register: use the type-appropriate XMM load or store opcode
        if (!toreg)
            cs.Iop = xmmstore(s->Stype->Tty);       // MOVSS/D mem,xreg
        else
            cs.Iop = xmmload(s->Stype->Tty);        // MOVSS/D xreg,mem
        cresult = getlvalue(&cs,evar,keep);
        cs.orReg(s->Sreglsw - XMM0);
        cresult = gen(cresult,&cs);
    }

    el_free(evar);
    return cresult;
}

/****************************
 * Generate code for, and output a thunk.
 * Input:
 *      thisty  Type of this pointer
 *      p       ESP parameter offset to this pointer
 *      d       offset to add to 'this' pointer
 *      d2      offset from 'this' to vptr
 *      i       offset into vtbl[]
 */

// The thunk first adjusts the 'this' pointer by d, then either jumps
// directly to sfunc or jumps indirectly through the vtbl. The code is
// written out immediately with codout(), and sthunk's Soffset, Ssize and
// Sseg are set to describe the emitted bytes.
void cod3_thunk(symbol *sthunk,symbol *sfunc,unsigned p,tym_t thisty,
        targ_size_t d,int i,targ_size_t d2)
{   code *c,*c1;
    targ_size_t thunkoffset;
    tym_t thunkty;

    cod3_align();

    /* Skip over return address */
    thunkty = tybasic(sthunk->ty());
#if TARGET_SEGMENTED
    if (tyfarfunc(thunkty))
        p += I32 ? 8 : tysize[TYfptr];          /* far function */
    else
#endif
        p += tysize[TYnptr];

    if (!I16)
    {
        /*
           Generate:
            ADD p[ESP],d
           For direct call:
            JMP sfunc
           For virtual call:
            MOV EAX, p[ESP]                     EAX = this
            MOV EAX, d2[EAX]                    EAX = this->vptr
            JMP i[EAX]                          jump to virtual function
         */
        // reg is the /digit field of opcode 0x81: 0 selects ADD, 5 selects SUB
        unsigned reg = 0;
        if ((targ_ptrdiff_t)d < 0)
        {
            // Negative adjustment: subtract the magnitude instead
            d = -d;
            reg = 5;                            // switch from ADD to SUB
        }
        if (thunkty == TYmfunc)
        {                                       // ADD ECX,d
            c = CNIL;
            if (d)
                c = genc2(c,0x81,modregrm(3,reg,CX),d);
        }
        else if (thunkty == TYjfunc || (I64 && thunkty == TYnfunc))
        {                                       // ADD EAX,d
            // (the register is DI rather than AX when generating 64 bit code)
            c = CNIL;
            if (d)
                c = genc2(c,0x81,modregrm(3,reg,I64 ? DI : AX),d);
        }
        else
        {
            // 'this' is on the stack: adjust it in place
            c = genc(CNIL,0x81,modregrm(2,reg,4),
                FLconst,p,                      // to this
                FLconst,d);                     // ADD p[ESP],d
            c->Isib = modregrm(0,4,SP);
        }
        // c may be CNIL here when d is 0 and 'this' is in a register
        if (I64 && c)
            c->Irex |= REX_W;
    }
    else
    {
        /*
           Generate:
            MOV BX,SP
            ADD [SS:] p[BX],d
           For direct call:
            JMP sfunc
           For virtual call:
            MOV BX, p[BX]                       BX = this
            MOV BX, d2[BX]                      BX = this->vptr
            JMP i[BX]                           jump to virtual function
         */


        c = genregs(CNIL,0x89,SP,BX);                   /* MOV BX,SP    */
        c1 = genc(CNIL,0x81,modregrm(2,0,7),
            FLconst,p,                                  /* to this      */
            FLconst,d);                                 /* ADD p[BX],d  */
        if (config.wflags & WFssneds ||
            // If DS needs reloading from SS,
            // then assume SS != DS on thunk entry
            (config.wflags & WFss && LARGEDATA))
            c1->Iflags |= CFss;                         /* SS:          */
        c = cat(c,c1);
    }

    // The low 16 bits of i being all ones marks a direct (non-virtual) call
    if ((i & 0xFFFF) != 0xFFFF)                 /* if virtual call      */
    {   code *c2,*c3;

        // 'this' (and therefore the vptr) is far when it is wider than a register
#define FARTHIS (tysize(thisty) > REGSIZE)
#define FARVPTR FARTHIS

#if TARGET_SEGMENTED
        assert(thisty != TYvptr);               /* can't handle this case */
#endif

        // c1 loads 'this' (CNIL if it is already in a register),
        // c2 loads the vptr, c3 is the indirect jump through the vtbl.
        if (!I16)
        {
            assert(!FARTHIS && !LARGECODE);
            if (thunkty == TYmfunc)     // if 'this' is in ECX
            {   c1 = CNIL;

                // MOV EAX,d2[ECX]
                c2 = genc1(CNIL,0x8B,modregrm(2,AX,CX),FLconst,d2);
            }
            else if (thunkty == TYjfunc)        // if 'this' is in EAX
            {   c1 = CNIL;

                // MOV EAX,d2[EAX]
                c2 = genc1(CNIL,0x8B,modregrm(2,AX,AX),FLconst,d2);
            }
            else
            {
                // MOV EAX,p[ESP]
                c1 = genc1(CNIL,0x8B,(modregrm(0,4,SP) << 8) | modregrm(2,AX,4),FLconst,(targ_uns) p);
                if (I64)
                    c1->Irex |= REX_W;

                // MOV EAX,d2[EAX]
                c2 = genc1(CNIL,0x8B,modregrm(2,AX,AX),FLconst,d2);
            }
            if (I64)
                code_orrex(c2, REX_W);
                                                        /* JMP i[EAX]   */
            c3 = genc1(CNIL,0xFF,modregrm(2,4,0),FLconst,(targ_uns) i);
        }
        else
        {
            /* MOV/LES BX,[SS:] p[BX]   */
            c1 = genc1(CNIL,(FARTHIS ? 0xC4 : 0x8B),modregrm(2,BX,7),FLconst,(targ_uns) p);
            if (config.wflags & WFssneds ||
                // If DS needs reloading from SS,
                // then assume SS != DS on thunk entry
                (config.wflags & WFss && LARGEDATA))
                c1->Iflags |= CFss;                     /* SS:          */

            /* MOV/LES BX,[ES:]d2[BX] */
            c2 = genc1(CNIL,(FARVPTR ? 0xC4 : 0x8B),modregrm(2,BX,7),FLconst,d2);
            if (FARTHIS)
                c2->Iflags |= CFes;                     /* ES:          */

                                                        /* JMP i[BX]    */
            // reg field 5 of opcode 0xFF is the far indirect JMP, 4 the near one
            c3 = genc1(CNIL,0xFF,modregrm(2,(LARGECODE ? 5 : 4),7),FLconst,(targ_uns) i);
            if (FARVPTR)
                c3->Iflags |= CFes;                     /* ES:          */
        }
        c = cat4(c,c1,c2,c3);
    }
    else
    {
        // Direct call: far JMP (0xEA) with segment:offset fixup for large
        // code models, otherwise a self-relative near JMP (0xE9).
        c1 = gencs(CNIL,(LARGECODE ? 0xEA : 0xE9),0,FLfunc,sfunc); /* JMP sfunc */
        c1->Iflags |= LARGECODE ? (CFseg | CFoff) : (CFselfrel | CFoff);
        c = cat(c,c1);
    }

    // Record where the thunk starts, emit it, and release the code list
    thunkoffset = Coffset;
    pinholeopt(c,NULL);
    codout(c);
    code_free(c);

    sthunk->Soffset = thunkoffset;
    sthunk->Ssize = Coffset - thunkoffset; /* size of thunk */
    sthunk->Sseg = cseg;
#if TARGET_LINUX || TARGET_OSX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_SOLARIS
    objpubdef(cseg,sthunk,sthunk->Soffset);
#endif
    searchfixlist(sthunk);              /* resolve forward refs */
}

/*****************************
 * Assume symbol s is extern.
 */

void makeitextern(symbol *s)
{
        // Nothing to do once s->Sxtrnnum is non-zero
        if (s->Sxtrnnum != 0)
                return;

        // Mark the symbol as external and hand it to the object file writer
        s->Sclass = SCextern;
        objextern(s);
}


/*******************************
 * Replace JMPs in Bgotocode with JMP SHORTs wherever possible.
 * This routine depends on FLcode jumps to only be forward
 * referenced.
 * BFLjmpoptdone is set to TRUE if nothing more can be done
 * with this block.
 * Input:
 *      flag    !=0 means don't have correct Boffsets yet
 * Returns:
 *      number of bytes saved
 */

// Walks the instructions of bl->Bcode. For every JMP, and every
// conditional jump (0x70..0x7F) still flagged CFjmp16, it works out the
// displacement to the target and, if that fits in a signed byte, switches
// the instruction to its short form. Jumps to the next instruction are
// turned into NOPs.
int branch(block *bl,int flag)
{ int bytesaved;
  code *c,*cn,*ct;
  targ_size_t offset,disp;
  targ_size_t csize;

  if (!flag)
      bl->Bflags |= BFLjmpoptdone;      // assume this will be all
  c = bl->Bcode;
  if (!c)
        return 0;
  bytesaved = 0;
  offset = bl->Boffset;                 /* offset of start of block     */
  while (1)
  {     unsigned char op;

        csize = calccodsize(c);         // size of the current instruction
        cn = code_next(c);              // instruction following c (may be NULL)
        op = c->Iop;
        if ((op & ~0x0F) == 0x70 && c->Iflags & CFjmp16 ||
            op == JMP)
        {
          L1:   // re-entered after the target has been redirected (jmp to jmp)
            switch (c->IFL2)
            {
                case FLblock:
                    if (flag)           // no offsets yet, don't optimize
                        goto L3;
                    // Distance from the end of this instruction to the
                    // start of the target block
                    disp = c->IEV2.Vblock->Boffset - offset - csize;

                    /* If this is a forward branch, and there is an aligned
                     * block intervening, it is possible that shrinking
                     * the jump instruction will cause it to be out of
                     * range of the target. This happens if the alignment
                     * prevents the target block from moving correspondingly
                     * closer.
                     */
                    if (disp >= 0x7F-4 && c->IEV2.Vblock->Boffset > offset)
                    {   /* Look for intervening alignment
                         */
                        for (block *b = bl->Bnext; b; b = b->Bnext)
                        {
                            if (b->Balign)
                            {
                                bl->Bflags &= ~BFLjmpoptdone;   // some JMPs left
                                goto L3;
                            }
                            if (b == c->IEV2.Vblock)
                                break;
                        }
                    }

                    break;

                case FLcode:
                {   code *cr;

                    disp = 0;

                    ct = c->IEV2.Vcode;         /* target of branch     */
                    assert(ct->Iflags & (CFtarg | CFtarg2));
                    // Forward search: add up the sizes of the instructions
                    // between this jump and its target
                    for (cr = cn; cr; cr = code_next(cr))
                    {
                        if (cr == ct)
                            break;
                        disp += calccodsize(cr);
                    }

                    if (!cr)
                    {   // Didn't find it in forward search. Try backwards jump
                        // Sum the sizes from the target up to and including
                        // the jump itself (s becomes 1 once ct is reached).
                        int s = 0;
                        disp = 0;
                        for (cr = bl->Bcode; cr != cn; cr = code_next(cr))
                        {
                            assert(cr != NULL); // must have found it
                            if (cr == ct)
                                s = 1;
                            if (s)
                                disp += calccodsize(cr);
                        }
                    }

                    if (config.flags4 & CFG4optimized && !flag)
                    {
                        /* Propagate branch forward past junk   */
                        // (NOPs and line number escapes); the new target
                        // gets CFtarg set.
                        while (1)
                        {   if (ct->Iop == NOP ||
                                ct->Iop == (ESCAPE | ESClinnum))
                            {   ct = code_next(ct);
                                if (!ct)
                                    goto L2;
                            }
                            else
                            {   c->IEV2.Vcode = ct;
                                ct->Iflags |= CFtarg;
                                break;
                            }
                        }

                        /* And eliminate jmps to jmps   */
                        // The target must be a JMP or the same opcode as
                        // this jump; take over its destination and start
                        // the analysis again.
                        if ((op == ct->Iop || ct->Iop == JMP) &&
                            (op == JMP || c->Iflags & CFjmp16))
                        {   c->IFL2 = ct->IFL2;
                            c->IEV2.Vcode = ct->IEV2.Vcode;
                            /*printf("eliminating branch\n");*/
                            goto L1;
                        }
                     L2: ;
                    }
                }
                    break;

                default:
                    // Other operand kinds are left alone
                    goto L3;
            }

            if (disp == 0)                      // bra to next instruction
            {   bytesaved += csize;
                c->Iop = NOP;                   // del branch instruction
                c->IEV2.Vcode = NULL;
                // offset is not advanced here; the whole instruction size
                // was counted as saved
                c = cn;
                if (!c)
                    break;
                continue;
            }
            // Both disp and disp - 2 must be representable as a signed byte
            else if ((targ_size_t)(targ_schar)(disp - 2) == (disp - 2) &&
                     (targ_size_t)(targ_schar)disp == disp)
            {
                if (op == JMP)
                {   c->Iop = JMPS;              // JMP SHORT
                    bytesaved += I16 ? 1 : 3;
                }
                else                            // else Jcond
                {   c->Iflags &= ~CFjmp16;      // a branch is ok
                    bytesaved += I16 ? 3 : 4;

                    // Replace a cond jump around a call to a function that
                    // never returns with a cond jump to that function.
                    // The jump must skip exactly the one CALL that follows
                    // it (disp of 3 in 16 bit code, 5 otherwise), and the
                    // CALL must not itself be a jump target.
                    if (config.flags4 & CFG4optimized &&
                        config.target_cpu >= TARGET_80386 &&
                        disp == (I16 ? 3 : 5) &&
                        cn &&
                        cn->Iop == CALL &&
                        cn->IFL2 == FLfunc &&
                        cn->IEVsym2->Sflags & SFLexit &&
                        !(cn->Iflags & (CFtarg | CFtarg2))
                       )
                    {
                        // Turn the CALL into the two-byte 0x0F 0x8x near
                        // conditional jump with the opposite condition
                        // (low bit of the condition code flipped).
                        cn->Iop = 0x0F00 | ((c->Iop & 0x0F) ^ 0x81);
                        c->Iop = NOP;
                        c->IEV2.Vcode = NULL;
                        bytesaved++;

                        // If nobody else points to ct, we can remove the CFtarg
                        if (flag && ct)
                        {   code *cx;

                            // Scan the block; reaching the end without
                            // finding another reference clears the flag
                            for (cx = bl->Bcode; 1; cx = code_next(cx))
                            {
                                if (!cx)
                                {   ct->Iflags &= ~CFtarg;
                                    break;
                                }
                                if (cx->IEV2.Vcode == ct)
                                    break;
                            }
                        }
                    }
                }
                csize = calccodsize(c);         // size changed, recompute it
            }
            else
                bl->Bflags &= ~BFLjmpoptdone;   // some JMPs left
        }
L3:
        // Advance to the next instruction, or stop at the end of the list
        if (cn)
        {   offset += csize;
            c = cn;
        }
        else
            break;
  }
  //printf("bytesaved = x%x\n",bytesaved);
  return bytesaved;
}

/************************************************
 * Adjust all Soffset's of stack variables so they
 * are all relative to the frame pointer.
 */

#if MARS

/* Rebase the Soffset of every symbol in globsym according to its
 * storage class:
 *      SCparameter, SCregpar           Soffset += Poff
 *      SCauto, SCfastpar, SCregister   Soffset += Aoff + BPoff
 *      SCbprel and everything else     left unchanged
 */
void cod3_adjSymOffsets()
{
    //printf("cod3_adjSymOffsets()\n");
    for (SYMIDX si = 0; si < globsym.top; si++)
    {   //printf("globsym.tab[%d] = %p\n",si,globsym.tab[si]);
        symbol *s = globsym.tab[si];

        switch (s->Sclass)
        {
            case SCparameter:
            case SCregpar:
                //printf("s = '%s', Soffset = x%x, Poff = x%x, EBPtoESP = x%x\n", s->Sident, s->Soffset, Poff, EBPtoESP);
                s->Soffset += Poff;
                break;

            case SCauto:
            case SCfastpar:
            case SCregister:
                //printf("s = '%s', Soffset = x%x, Aoff = x%x, BPoff = x%x EBPtoESP = x%x\n", s->Sident, s->Soffset, Aoff, BPoff, EBPtoESP);
                s->Soffset += Aoff + BPoff;
                break;

            case SCbprel:       // no adjustment is applied to these
            default:
                break;
        }
    }
}

#endif

/*******************************
 * Resolve the symbolic operands in the code of block bl by running
 * assignaddrc() over bl->Bcode.
 *
 * For a block flagged BFLoutsideprolog the code is processed with
 * hasframe == 0 and EBPtoESP == -REGSIZE.  Whatever values hasframe and
 * EBPtoESP held on entry are put back before returning.
 */

void assignaddr(block *bl)
{
    // Snapshot the frame state so it can be reinstated on the way out
    const int hasframeOnEntry = hasframe;
    const int EBPtoESPOnEntry = EBPtoESP;

    if (bl->Bflags & BFLoutsideprolog)
    {
        hasframe = 0;
        EBPtoESP = -REGSIZE;
    }

    assignaddrc(bl->Bcode);

    EBPtoESP = EBPtoESPOnEntry;
    hasframe = hasframeOnEntry;
}

/*******************************
 * Walk the code list starting at c and replace the symbolic operand
 * references of each instruction (IFL1/IEV1 and IFL2/IEV2) with
 * concrete offsets.  The instructions are modified in place.
 *
 * For each instruction:
 *  - ESCAPE pseudo-ops are handled specially: ESCadjesp adds its
 *    IEV1.Vint to the running EBPtoESP and is turned into a NOP;
 *    ESCframeptr is rewritten into MOV reg,EBP (if hasframe) or
 *    LEA reg,EBPtoESP[ESP] (if not).
 *  - The first operand is processed only when the instruction has a
 *    modregrm byte (M) that addresses memory with a displacement.
 *  - The second operand is processed only when the size table entry
 *    has the T bit set.
 * Operands that end up as plain numbers get their FL set to FLconst.
 * Frame based first operands are converted from EBP relative to ESP
 * relative addressing when hasframe is 0 (label L2).
 *
 * Side effects: the file static EBPtoESP is modified by ESCadjesp.
 */
void assignaddrc(code *c)
{
    int sn;
    symbol *s;
    unsigned char ins,rm;
    targ_size_t soff;
    targ_size_t base;

    // EBPtoESP as it was on entry; used by the FLstack cases to apply only
    // the change accumulated from ESCadjesp since the start of this list
    base = EBPtoESP;
    for (; c; c = code_next(c))
    {
#ifdef DEBUG
        if (0)
        {       printf("assignaddrc()\n");
                c->print();
        }
        // Catch a two element cycle in the code list
        if (code_next(c) && code_next(code_next(c)) == c)
            assert(0);
#endif
        // Fetch the size/attribute flags (M, T, ...) for this opcode
        if (c->Iflags & CFvex)
            ins = vex_inssize(c);
        else if ((c->Iop & 0xFFFD00) == 0x0F3800)
            ins = inssize2[(c->Iop >> 8) & 0xFF];
        else if ((c->Iop & 0xFF00) == 0x0F00)
            ins = inssize2[c->Iop & 0xFF];
        else if ((c->Iop & 0xFF) == ESCAPE)
        {
            if (c->Iop == (ESCAPE | ESCadjesp))
            {
                //printf("adjusting EBPtoESP (%d) by %ld\n",EBPtoESP,c->IEV1.Vint);
                EBPtoESP += c->IEV1.Vint;
                c->Iop = NOP;
            }
            if (c->Iop == (ESCAPE | ESCframeptr))
            {   // Convert to load of frame pointer
                // c->Irm is the register to use
                if (hasframe)
                {   // MOV reg,EBP
                    c->Iop = 0x89;
                    if (c->Irm & 8)
                        c->Irex |= REX_B;
                    c->Irm = modregrm(3,BP,c->Irm & 7);
                }
                else
                {   // LEA reg,EBPtoESP[ESP]
                    c->Iop = 0x8D;
                    if (c->Irm & 8)
                        c->Irex |= REX_R;
                    c->Irm = modregrm(2,c->Irm & 7,4);
                    c->Isib = modregrm(0,4,SP);
                    c->Iflags = CFoff;
                    c->IFL1 = FLconst;
                    c->IEV1.Vuns = EBPtoESP;
                }
            }
            // NOTE: REX_W is set here for every ESCAPE op when I64,
            // not only for the ESCframeptr conversions above
            if (I64)
                c->Irex |= REX_W;
            continue;
        }
        else
            ins = inssize[c->Iop & 0xFF];
        if (!(ins & M) ||
            ((rm = c->Irm) & 0xC0) == 0xC0)
            goto do2;           /* if no first operand          */
        if (is32bitaddr(I32,c->Iflags))
        {
            // mod == 0 carries a displacement only for rm == 5, or for a
            // SIB byte (rm == 4) whose base field is 5

            if (
                ((rm & 0xC0) == 0 && !((rm & 7) == 4 && (c->Isib & 7) == 5 || (rm & 7) == 5))
               )
                goto do2;       /* if no first operand  */
        }
        else
        {
            // 16 bit addressing: mod == 0 carries a displacement only for rm == 6
            if (
                ((rm & 0xC0) == 0 && !((rm & 7) == 6))
               )
                goto do2;       /* if no first operand  */
        }
        s = c->IEVsym1;
        // Resolve the first operand.  Cases that 'break' fall out to
        // "c->IFL1 = FLconst" below; cases that 'goto do2' keep their
        // (possibly rewritten) IFL1.
        switch (c->IFL1)
        {
#if OMFOBJ
            case FLdata:
                if (s->Sclass == SCcomdat)
                {   c->IFL1 = FLextern;
                    goto do2;
                }
#if MARS
                c->IEVseg1 = s->Sseg;
#else
                c->IEVseg1 = DATA;
#endif
                c->IEVpointer1 += s->Soffset;
                c->IFL1 = FLdatseg;
                goto do2;
            case FLudata:
#if MARS
                c->IEVseg1 = s->Sseg;
#else
                c->IEVseg1 = UDATA;
#endif
                c->IEVpointer1 += s->Soffset;
                c->IFL1 = FLdatseg;
                goto do2;
#else                                   // don't lose symbol information
            case FLdata:
            case FLudata:
            case FLtlsdata:
                c->IFL1 = FLextern;
                goto do2;
#endif
            case FLdatseg:
                c->IEVseg1 = DATA;
                goto do2;

#if TARGET_SEGMENTED
            case FLfardata:
            case FLcsdata:
#endif
            case FLpseudo:
                goto do2;

            case FLstack:
                //printf("Soffset = %d, EBPtoESP = %d, base = %d, pointer = %d\n",
                //s->Soffset,EBPtoESP,base,c->IEVpointer1);
                c->IEVpointer1 += s->Soffset + EBPtoESP - base - EEoffset;
                break;

            case FLreg:
            case FLauto:
                soff = Aoff;
            L1:
                // Shared handling for FLreg/FLauto/FLpara/FLtmp; soff is the
                // offset selected by the case that jumped here
                if (s->Sflags & SFLunambig && !(s->Sflags & SFLread) && // if never loaded
                    !anyiasm &&
                    // if not optimized, leave it in for debuggability
                    (config.flags4 & CFG4optimized || !config.fulltypes))
                {   c->Iop = NOP;               // remove references to it
                    continue;
                }
                if (s->Sfl == FLreg && c->IEVpointer1 < 2)
                {       // Symbol is in a register: turn the memory operand
                        // into a register operand (mod == 3)
                        int reg = s->Sreglsw;

                        assert(!(s->Sregm & ~mask[reg]));
                        if (c->IEVpointer1 == 1)
                        {   assert(reg < 4);    /* must be a BYTEREGS   */
                            reg |= 4;           /* convert to high byte reg */
                        }
                        if (reg & 8)
                        {   assert(I64);
                            c->Irex |= REX_B;
                            reg &= 7;
                        }
                        c->Irm = (c->Irm & modregrm(0,7,0))
                                | modregrm(3,0,reg);
                        assert(c->Iop != LES && c->Iop != LEA);
                        goto do2;
                }
                else
                {   c->IEVpointer1 += s->Soffset + soff + BPoff;
                    if (s->Sflags & SFLunambig)
                        c->Iflags |= CFunambig;
            L2:
                    // Entered from several cases below via goto as well
                    if (!hasframe)
                    {   /* Convert to ESP relative address instead of EBP */
                        // NOTE: this rm shadows the function level rm
                        unsigned char rm;

                        assert(!I16);
                        c->IEVpointer1 += EBPtoESP;
                        rm = c->Irm;
                        if ((rm & 7) == 4)              // if SIB byte
                        {
                            assert((c->Isib & 7) == BP);
                            assert((rm & 0xC0) != 0);
                            c->Isib = (c->Isib & ~7) | modregrm(0,0,SP);
                        }
                        else
                        {
                            assert((rm & 7) == 5);
                            c->Irm = (rm & modregrm(0,7,0))
                                    | modregrm(2,0,4);
                            c->Isib = modregrm(0,4,SP);
                        }
                    }
                }
                break;
            case FLpara:
                soff = Poff - BPoff;    // cancel out add of BPoff
                goto L1;
            case FLtmp:
                soff = Toff;
                goto L1;
            case FLfltreg:
                c->IEVpointer1 += Foff + BPoff;
                c->Iflags |= CFunambig;
                goto L2;
            case FLallocatmp:
                c->IEVpointer1 += AAoff + BPoff;
                goto L2;
            case FLbprel:
                c->IEVpointer1 += s->Soffset;
                break;
            case FLcs:
                sn = c->IEV1.Vuns;
                if (!CSE_loaded(sn))            // if never loaded
                {       c->Iop = NOP;
                        continue;
                }
                c->IEVpointer1 = sn * REGSIZE + CSoff + BPoff;
                c->Iflags |= CFunambig;
                goto L2;
            case FLregsave:
                sn = c->IEV1.Vuns;
                c->IEVpointer1 = sn + regsave.off + BPoff;
                c->Iflags |= CFunambig;
                goto L2;
            case FLndp:
#if MARS
                assert(c->IEV1.Vuns < NDP::savetop);
#endif
                c->IEVpointer1 = c->IEV1.Vuns * NDPSAVESIZE + NDPoff + BPoff;
                c->Iflags |= CFunambig;
                goto L2;
            case FLoffset:
                break;
            case FLlocalsize:
                c->IEVpointer1 += localsize;
                break;
            case FLconst:
            default:
                goto do2;
        }
        // First operand is now a plain displacement
        c->IFL1 = FLconst;
    do2:
        /* Ignore TEST (F6 and F7) opcodes      */
        if (!(ins & T)) goto done;              /* if no second operand */
        s = c->IEVsym2;
        // Resolve the second operand.  Cases that 'break' fall out to
        // "c->IFL2 = FLconst" below.
        switch (c->IFL2)
        {
#if ELFOBJ || MACHOBJ
            case FLdata:
            case FLudata:
            case FLtlsdata:
                c->IFL2 = FLextern;
                // Re-enters do2; FLextern has no case of its own here and
                // therefore takes the default branch to done
                goto do2;
#else
            case FLdata:
                if (s->Sclass == SCcomdat)
                {   c->IFL2 = FLextern;
                    goto do2;
                }
#if MARS
                c->IEVseg2 = s->Sseg;
#else
                c->IEVseg2 = DATA;
#endif
                c->IEVpointer2 += s->Soffset;
                c->IFL2 = FLdatseg;
                goto done;
            case FLudata:
#if MARS
                c->IEVseg2 = s->Sseg;
#else
                c->IEVseg2 = UDATA;
#endif
                c->IEVpointer2 += s->Soffset;
                c->IFL2 = FLdatseg;
                goto done;
#endif
            case FLdatseg:
                c->IEVseg2 = DATA;
                goto done;
#if TARGET_SEGMENTED
            case FLcsdata:
            case FLfardata:
                goto done;
#endif
            case FLreg:
            case FLpseudo:
                assert(0);
                /* NOTREACHED */
            case FLauto:
                c->IEVpointer2 += s->Soffset + Aoff + BPoff;
                break;
            case FLpara:
                c->IEVpointer2 += s->Soffset + Poff;
                break;
            case FLtmp:
                c->IEVpointer2 += s->Soffset + Toff + BPoff;
                break;
            case FLfltreg:
                c->IEVpointer2 += Foff + BPoff;
                break;
            case FLallocatmp:
                c->IEVpointer2 += AAoff + BPoff;
                break;
            case FLbprel:
                c->IEVpointer2 += s->Soffset;
                break;

            case FLstack:
                // Unlike the first operand case, EEoffset is not subtracted here
                c->IEVpointer2 += s->Soffset + EBPtoESP - base;
                break;

            case FLcs:
            case FLndp:
            case FLregsave:
                assert(0);
                /* NOTREACHED */

            case FLconst:
                break;

            case FLlocalsize:
                c->IEVpointer2 += localsize;
                break;

            default:
                goto done;
        }
        // Second operand is now a plain value
        c->IFL2 = FLconst;
  done:
        ;
    }
}

/*******************************
 * Compute the offset of symbol s from BP.
 *
 * The result is s->Soffset plus an adjustment chosen by s->Sfl:
 *      FLpara          Poff
 *      FLauto          Aoff + BPoff
 *      FLtmp           Toff + BPoff
 * Any other Sfl is a program error (asserts; DEBUG builds first print
 * the FL and the symbol).  A stack frame must exist (hasframe).
 */

targ_size_t cod3_bpoffset(symbol *s)
{
    symbol_debug(s);

    targ_size_t offset = s->Soffset;

    if (s->Sfl == FLpara)
    {
        offset += Poff;
    }
    else if (s->Sfl == FLauto)
    {
        offset += Aoff + BPoff;
    }
    else if (s->Sfl == FLtmp)
    {
        offset += Toff + BPoff;
    }
    else
    {
        // Not a kind of symbol that lives at a BP relative address
#ifdef DEBUG
        WRFL((enum FL)s->Sfl);
        symbol_print(s);
#endif
        assert(0);
    }

    assert(hasframe);
    return offset;
}


/*******************************
 * Find shorter versions of the same instructions.
 * Does these optimizations:
 *      replaces jmps to the next instruction with NOPs
 *      sign extension of modregrm displacement
 *      sign extension of immediate data (can't do it for OR, AND, XOR
 *              as the opcodes are not defined)
 *      short versions for AX EA
 *      short versions for reg EA
 * Input:
 *      c -> start of the code list to optimize; instructions are
 *           rewritten in place
 *      b -> block for code (or NULL)
 * Notes:
 *      After most rewrites control jumps back to L1, so the modified
 *      instruction is examined again and further rewrites can apply.
 *      When b is a BCasm block, the rewrites that depend on CFG4space
 *      (usespace/useopsize) are not enabled by that flag.
 *      In DEBUG builds the first call runs pinholeopt_unittest().
 */

void pinholeopt(code *c,block *b)
{ targ_size_t a;
  unsigned op,mod;
  unsigned char ins;
  int usespace;
  int useopsize;
  int space;            // not referenced anywhere in this function
  block *bn;

#ifdef DEBUG
    static int tested; if (!tested) { tested++; pinholeopt_unittest(); }
#endif

#if 0
  code *cstart = c;
  if (debugc)
  {
      printf("+pinholeopt(%p)\n",c);
  }
#endif

  // bn is the block following b; it is used to recognize a trailing
  // jump to the next block.
  if (b)
  {     bn = b->Bnext;
        usespace = (config.flags4 & CFG4space && b->BC != BCasm);
        useopsize = (I16 || (config.flags4 & CFG4space && b->BC != BCasm));
  }
  else
  {     bn = NULL;
        usespace = (config.flags4 & CFG4space);
        useopsize = (I16 || config.flags4 & CFG4space);
  }
  for (; c; c = code_next(c))
  {
    L1:
        // Re-entry point after an instruction has been rewritten
        op = c->Iop;
        // Fetch the size/attribute flags (M, T, E, A, R, ...) for this opcode
        if (c->Iflags & CFvex)
            ins = vex_inssize(c);
        else if ((op & 0xFFFD00) == 0x0F3800)
            ins = inssize2[(op >> 8) & 0xFF];
        else if ((op & 0xFF00) == 0x0F00)
            ins = inssize2[op & 0xFF];
        else
            ins = inssize[op & 0xFF];
        if (ins & M)            // if modregrm byte
        {   // shortop: CFopsize inverts the I16 default (used below to
            // treat the immediate as a 16 bit quantity)
            int shortop = (c->Iflags & CFopsize) ? !I16 : I16;
            int local_BPRM = BPRM;

            if (c->Iflags & CFaddrsize)
                local_BPRM ^= 5 ^ 6;    // toggle between 5 and 6

            unsigned rm = c->Irm;
            unsigned reg = rm & modregrm(0,7,0);          // isolate reg field
            unsigned ereg = rm & 7;
            //printf("c = %p, op = %02x rm = %02x\n", c, op, rm);

            /* If immediate second operand      */
            if ((ins & T ||
                 ((op == 0xF6 || op == 0xF7) && (reg < modregrm(0,2,0) || reg > modregrm(0,3,0)))
                ) &&
                c->IFL2 == FLconst)
            {
                int flags = c->Iflags & CFpsw;      /* if want result in flags */
                // u is the immediate, sign extended from its encoded width
                targ_long u = c->IEV2.Vuns;
                if (ins & E)
                    u = (signed char) u;
                else if (shortop)
                    u = (short) u;

                // Replace CMP reg,0 with TEST reg,reg
                if ((op & 0xFE) == 0x80 &&              // 80 is CMP R8,imm8; 81 is CMP reg,imm
                    rm >= modregrm(3,7,AX) &&
                    u == 0)
                {       c->Iop = (op & 1) | 0x84;
                        c->Irm = modregrm(3,ereg,ereg);
                        // Same register now also sits in the reg field,
                        // so mirror REX_B into REX_R
                        if (c->Irex & REX_B)
                            c->Irex |= REX_R;
                        goto L1;
                }

                /* Optimize ANDs with an immediate constant             */
                if ((op == 0x81 || op == 0x80) && reg == modregrm(0,4,0))
                {
                    if (rm >= modregrm(3,4,AX))         // AND reg,imm
                    {
                        if (u == 0)
                        {       /* Replace with XOR reg,reg     */
                                c->Iop = 0x30 | (op & 1);
                                c->Irm = modregrm(3,ereg,ereg);
                                if (c->Irex & REX_B)
                                    c->Irex |= REX_R;
                                goto L1;
                        }
                        // AND with all ones changes nothing; drop it when
                        // the flags result is not wanted
                        if (u == 0xFFFFFFFF && !flags)
                        {       c->Iop = NOP;
                                goto L1;
                        }
                    }
                    if (op == 0x81 && !flags)
                    {   // If we can do the operation in one byte

                        // If EA is not SI or DI
                        if ((rm < modregrm(3,4,SP) || I64) &&
                            (config.flags4 & CFG4space ||
                             config.target_cpu < TARGET_PentiumPro)
                           )
                        {
                            // L2 and L3 are inside the TEST/OR/XOR section below
                            if ((u & 0xFFFFFF00) == 0xFFFFFF00)
                                goto L2;
                            else if (rm < modregrm(3,0,0) || (!c->Irex && ereg < 4))
                            {   if (!shortop)
                                {   if ((u & 0xFFFF00FF) == 0xFFFF00FF)
                                        goto L3;
                                }
                                else
                                {
                                    if ((u & 0xFF) == 0xFF)
                                        goto L3;
                                }
                            }
                        }
                        if (!shortop && useopsize)
                        {
                            // Upper 16 bits of the mask are all ones: toggle CFopsize
                            if ((u & 0xFFFF0000) == 0xFFFF0000)
                            {   c->Iflags ^= CFopsize;
                                goto L1;
                            }
                            if ((u & 0xFFFF) == 0xFFFF && rm < modregrm(3,4,AX))
                            {   c->IEVoffset1 += 2; /* address MSW      */
                                c->IEV2.Vuns >>= 16;
                                c->Iflags ^= CFopsize;
                                goto L1;
                            }
                            if (rm >= modregrm(3,4,AX))
                            {
                                if (u == 0xFF && (rm <= modregrm(3,4,BX) || I64))
                                {   c->Iop = 0x0FB6;     // MOVZX
                                    c->Irm = modregrm(3,ereg,ereg);
                                    if (c->Irex & REX_B)
                                        c->Irex |= REX_R;
                                    goto L1;
                                }
                                if (u == 0xFFFF)
                                {   c->Iop = 0x0FB7;     // MOVZX
                                    c->Irm = modregrm(3,ereg,ereg);
                                    if (c->Irex & REX_B)
                                        c->Irex |= REX_R;
                                    goto L1;
                                }
                            }
                        }
                    }
                }

                /* Look for ADD,OR,SUB,XOR with u that we can eliminate */
                if (!flags &&
                    (op == 0x81 || op == 0x80) &&
                    (reg == modregrm(0,0,0) || reg == modregrm(0,1,0) ||  // ADD,OR
                     reg == modregrm(0,5,0) || reg == modregrm(0,6,0))    // SUB, XOR
                   )
                {
                        if (u == 0)
                        {
                                c->Iop = NOP;
                                goto L1;
                        }
                        if (u == ~0 && reg == modregrm(0,6,0))  /* XOR  */
                        {
                                c->Iop = 0xF6 | (op & 1);       /* NOT  */
                                // flip the reg field from 6 to 2
                                c->Irm ^= modregrm(0,6^2,0);
                                goto L1;
                        }
                        if (!shortop &&
                            useopsize &&
                            op == 0x81 &&
                            (u & 0xFFFF0000) == 0 &&
                            (reg == modregrm(0,6,0) || reg == modregrm(0,1,0)))
                        {    c->Iflags ^= CFopsize;
                             goto L1;
                        }
                }

                /* Look for TEST or OR or XOR with an immediate constant */
                /* that we can replace with a byte operation            */
                if (op == 0xF7 && reg == modregrm(0,0,0) ||
                    op == 0x81 && reg == modregrm(0,6,0) && !flags ||
                    op == 0x81 && reg == modregrm(0,1,0))
                {
                    // See if we can replace a dword with a word
                    // (avoid for 32 bit instructions, because CFopsize
                    //  is too slow)
                    if (!shortop && useopsize)
                    {   if ((u & 0xFFFF0000) == 0)
                        {   c->Iflags ^= CFopsize;
                            goto L1;
                        }
                        /* If memory (not register) addressing mode     */
                        if ((u & 0xFFFF) == 0 && rm < modregrm(3,0,AX))
                        {   c->IEVoffset1 += 2; /* address MSW  */
                            c->IEV2.Vuns >>= 16;
                            c->Iflags ^= CFopsize;
                            goto L1;
                        }
                    }

                    // If EA is not SI or DI
                    if (rm < (modregrm(3,0,SP) | reg) &&
                        (usespace ||
                         config.target_cpu < TARGET_PentiumPro)
                       )
                    {
                        if ((u & 0xFFFFFF00) == 0)
                        {
                        // L2 is also a goto target of the AND section above
                        L2: c->Iop--;           /* to byte instruction  */
                            c->Iflags &= ~CFopsize;
                            goto L1;
                        }
                        if (((u & 0xFFFF00FF) == 0 ||
                             (shortop && (u & 0xFF) == 0)) &&
                            (rm < modregrm(3,0,0) || (!c->Irex && ereg < 4)))
                        {
                        // L3 is also a goto target of the AND section above:
                        // operate on the second byte of the operand instead
                        L3:
                            c->IEV2.Vuns >>= 8;
                            if (rm >= (modregrm(3,0,AX) | reg))
                                c->Irm |= 4;    /* AX->AH, BX->BH, etc. */
                            else
                                c->IEVoffset1 += 1;
                            goto L2;
                        }
                    }
#if 0
                    // BUG: which is right?
                    else if ((u & 0xFFFF0000) == 0)
#else
                    // Disabled by the leading "0 &&"
                    else if (0 && op == 0xF7 &&
                             rm >= modregrm(3,0,SP) &&
                             (u & 0xFFFF0000) == 0)
#endif
                        c->Iflags &= ~CFopsize;
                }

                // Try to replace TEST reg,-1 with TEST reg,reg
                if (op == 0xF6 && rm >= modregrm(3,0,AX) && rm <= modregrm(3,0,7)) // TEST regL,immed8
                {       if ((u & 0xFF) == 0xFF)
                        {
                           L4:  c->Iop = 0x84;          // TEST regL,regL
                                c->Irm = modregrm(3,ereg,ereg);
                                if (c->Irex & REX_B)
                                    c->Irex |= REX_R;
                                c->Iflags &= ~CFopsize;
                                goto L1;
                        }
                }
                if (op == 0xF7 && rm >= modregrm(3,0,AX) && rm <= modregrm(3,0,7) && (I64 || ereg < 4))
                {       if (u == 0xFF)
                                goto L4;
                        if ((u & 0xFFFF) == 0xFF00 && shortop && !c->Irex && ereg < 4)
                        {       ereg |= 4;                /* to regH      */
                                goto L4;
                        }
                }

                /* Look for sign extended immediate data */
                if ((signed char) u == u)
                {
                    if (op == 0x81)
                    {   // reg fields 0x08, 0x20, 0x30 are OR, AND, XOR
                        if (reg != 0x08 && reg != 0x20 && reg != 0x30)
                            c->Iop = op = 0x83;         /* 8 bit sgn ext */
                    }
                    else if (op == 0x69)                /* IMUL rw,ew,dw */
                        c->Iop = op = 0x6B;             /* IMUL rw,ew,db */
                }

                // Look for SHIFT EA,imm8 we can replace with short form
                if (u == 1 && ((op & 0xFE) == 0xC0))
                    c->Iop |= 0xD0;

            } /* if immediate second operand */

            /* Look for AX short form */
            if (ins & A)
            {   if (rm == modregrm(0,AX,local_BPRM) &&
                    !(c->Irex & REX_R) &&               // and it's AX, not R8
                    (op & ~3) == 0x88 &&
                    !I64)
                {       op = ((op & 3) + 0xA0) ^ 2;
                        /* 8A-> A0 */
                        /* 8B-> A1 */
                        /* 88-> A2 */
                        /* 89-> A3 */
                        c->Iop = op;
                        // The address moves from operand 1 to operand 2
                        c->IFL2 = c->IFL1;
                        c->IEV2 = c->IEV1;
                }

                /* Replace MOV REG1,REG2 with MOV EREG1,EREG2   */
                else if (!I16 &&
                         (op == 0x89 || op == 0x8B) &&
                         (rm & 0xC0) == 0xC0 &&
                         (!b || b->BC != BCasm)
                        )
                    c->Iflags &= ~CFopsize;

                // If rm is AX
                else if ((rm & modregrm(3,0,7)) == modregrm(3,0,AX) && !(c->Irex & (REX_R | REX_B)))
                {       switch (op)
                        {   case 0x80:  op = reg | 4; break;
                            case 0x81:  op = reg | 5; break;
                            case 0x87:  op = 0x90 + (reg>>3); break;    // XCHG
                            case 0xF6:
                                if (reg == 0)
                                    op = 0xA8;  /* TEST AL,immed8       */
                                break;
                            case 0xF7:
                                if (reg == 0)
                                    op = 0xA9;  /* TEST AX,immed16      */
                                break;
                        }
                        c->Iop = op;
                }
            }

            /* Look for reg short form */
            if ((ins & R) && (rm & 0xC0) == 0xC0)
            {   switch (op)
                {   case 0xC6:  op = 0xB0 + ereg; break;
                    case 0xC7:  op = 0xB8 + ereg; break;
                    case 0xFF:
                        switch (reg)
                        {   case 6<<3: op = 0x50+ereg; break;/* PUSH*/
                            case 0<<3: if (!I64) op = 0x40+ereg; break; /* INC*/
                            case 1<<3: if (!I64) op = 0x48+ereg; break; /* DEC*/
                        }
                        break;
                    case 0x8F:  op = 0x58 + ereg; break;
                    case 0x87:
                        if (reg == 0) op = 0x90 + ereg;
                        break;
                }
                c->Iop = op;
            }

            // Look to replace SHL reg,1 with ADD reg,reg
            if ((op & ~1) == 0xD0 &&
                     (rm & modregrm(3,7,0)) == modregrm(3,4,0) &&
                     config.target_cpu >= TARGET_80486)
            {
                // opcode D0/D1 becomes 00/01
                c->Iop &= 1;
                c->Irm = (rm & modregrm(3,0,7)) | (ereg << 3);
                if (c->Irex & REX_B)
                    c->Irex |= REX_R;
                if (!(c->Iflags & CFpsw) && !I16)
                    c->Iflags &= ~CFopsize;
                goto L1;
            }

            /* Look for sign extended modregrm displacement, or 0
             * displacement.
             */

            if (((rm & 0xC0) == 0x80) && // it's a 16/32 bit disp
                c->IFL1 == FLconst)      // and it's a constant
            {
                a = c->IEVpointer1;
                if (a == 0 && (rm & 7) != local_BPRM &&         // if 0[disp]
                    !(local_BPRM == 5 && (rm & 7) == 4 && (c->Isib & 7) == BP)
                   )
                    c->Irm &= 0x3F;             // mod 2 -> mod 0 (no displacement)
                else if (!I16)
                {
                    if ((targ_size_t)(targ_schar)a == a)
                        c->Irm ^= 0xC0;                 /* do 8 sx      */
                }
                else if (((targ_size_t)(targ_schar)a & 0xFFFF) == (a & 0xFFFF))
                    c->Irm ^= 0xC0;                     /* do 8 sx      */
            }

            /* Look for LEA reg,[ireg], replace with MOV reg,ireg       */
            else if (op == 0x8D)
            {   rm = c->Irm & 7;
                mod = c->Irm & modregrm(3,0,0);
                if (mod == 0)
                {
                    if (!I16)
                    {
                        switch (rm)
                        {
                            // rm 4 and 5 are left alone
                            case 4:
                            case 5:
                                break;
                            default:
                                c->Irm |= modregrm(3,0,0);
                                c->Iop = 0x8B;
                                break;
                        }
                    }
                    else
                    {
                        // 16 bit addressing: rm 4, 5, 7 map to SI, DI, BX
                        switch (rm)
                        {
                            case 4:     rm = modregrm(3,0,SI);  goto L6;
                            case 5:     rm = modregrm(3,0,DI);  goto L6;
                            case 7:     rm = modregrm(3,0,BX);  goto L6;
                            L6:     c->Irm = rm + reg;
                                    c->Iop = 0x8B;
                                    break;
                        }
                    }
                }

                /* replace LEA reg,0[BP] with MOV reg,BP        */
                else if (mod == modregrm(1,0,0) && rm == local_BPRM &&
                        c->IFL1 == FLconst && c->IEVpointer1 == 0)
                {       c->Iop = 0x8B;          /* MOV reg,BP   */
                        c->Irm = modregrm(3,0,BP) + reg;
                }
            }

            // Replace [R13] with 0[R13]
            if (c->Irex & REX_B && (c->Irm & modregrm(3,0,5)) == modregrm(0,0,5))
            {
                c->Irm |= modregrm(1,0,0);
                c->IFL1 = FLconst;
                c->IEVpointer1 = 0;
            }
        }
        else if (!(c->Iflags & CFvex))
        {
            switch (op)
            {
                default:
                    // Opcodes 0x70..0x7F fall through and are handled
                    // together with JMP; anything else is left alone
                    if ((op & ~0x0F) != 0x70)
                        break;
                case JMP:
                    switch (c->IFL2)
                    {   case FLcode:
                                // Jump to the very next instruction
                                if (c->IEV2.Vcode == code_next(c))
                                {       c->Iop = NOP;
                                        continue;
                                }
                                break;
                        case FLblock:
                                // Last instruction jumping to the following block
                                if (!code_next(c) && c->IEV2.Vblock == bn)
                                {       c->Iop = NOP;
                                        continue;
                                }
                                break;
                        case FLconst:
                        case FLfunc:
                        case FLextern:
                                break;
                        default:
#ifdef DEBUG
                                WRFL((enum FL)c->IFL2);
#endif
                                assert(0);
                    }
                    break;

                case 0x68:                      // PUSH immed16
                    if (c->IFL2 == FLconst)
                    {
                        targ_long u = c->IEV2.Vuns;
                        if (I64 ||
                            ((c->Iflags & CFopsize) ? I16 : I32))
                        {   // PUSH 32/64 bit operand
                            if (u == (signed char) u)
                                c->Iop = 0x6A;          // PUSH immed8
                        }
                        else // PUSH 16 bit operand
                        {   if ((short)u == (signed char) u)
                                c->Iop = 0x6A;          // PUSH immed8
                        }
                    }
                    break;
            }
        }
  }
#if 0
  if (1 || debugc) {
      printf("-pinholeopt(%p)\n",cstart);
        for (c = cstart; c; c = code_next(c))
            c->print();
  }
#endif
}

#ifdef DEBUG
STATIC void pinholeopt_unittest()
{
    //printf("pinholeopt_unittest()\n");

    /* Each row is a pair: the instruction handed to pinholeopt() and the
     * instruction expected back.  'model' in the first element restricts
     * the row to one memory model (16, 32 or 64); 0 means run it always.
     * The remaining fields are loaded into / compared with Iop, Iea,
     * IEV1.Vuns, IEV2.Vuns and Iflags respectively.
     */
    struct CS { unsigned model,op,ea,ev1,ev2,flags; } tests[][2] =
    {
        // XOR reg,immed                            NOT regL
        {{ 16,0x81,modregrm(3,6,BX),0,0xFF,0 },    { 0,0xF6,modregrm(3,2,BX),0,0xFF }},

        // MOV 0[BX],3                               MOV [BX],3
        {{ 16,0xC7,modregrm(2,0,7),0,3},           { 0,0xC7,modregrm(0,0,7),0,3 }},

#if 0 // only if config.flags4 & CFG4space
        // TEST regL,immed8
        {{ 0,0xF6,modregrm(3,0,BX),0,0xFF,0 },    { 0,0x84,modregrm(3,BX,BX),0,0xFF }},
        {{ 0,0xF7,modregrm(3,0,BX),0,0xFF,0 },    { 0,0x84,modregrm(3,BX,BX),0,0xFF }},
        {{ 64,0xF6,modregrmx(3,0,R8),0,0xFF,0 },  { 0,0x84,modregxrmx(3,R8,R8),0,0xFF }},
        {{ 64,0xF7,modregrmx(3,0,R8),0,0xFF,0 },  { 0,0x84,modregxrmx(3,R8,R8),0,0xFF }},
#endif

        // PUSH immed => PUSH immed8
        {{ 0,0x68,0,0,0 },    { 0,0x6A,0,0,0 }},
        {{ 0,0x68,0,0,0x7F }, { 0,0x6A,0,0,0x7F }},
        {{ 0,0x68,0,0,0x80 }, { 0,0x68,0,0,0x80 }},
        {{ 16,0x68,0,0,0,CFopsize },    { 0,0x6A,0,0,0,CFopsize }},
        {{ 16,0x68,0,0,0x7F,CFopsize }, { 0,0x6A,0,0,0x7F,CFopsize }},
        {{ 16,0x68,0,0,0x80,CFopsize }, { 0,0x68,0,0,0x80,CFopsize }},
        {{ 16,0x68,0,0,0x10000,0 },     { 0,0x6A,0,0,0x10000,0 }},
        {{ 16,0x68,0,0,0x10000,CFopsize }, { 0,0x68,0,0,0x10000,CFopsize }},
        {{ 32,0x68,0,0,0,CFopsize },    { 0,0x6A,0,0,0,CFopsize }},
        {{ 32,0x68,0,0,0x7F,CFopsize }, { 0,0x6A,0,0,0x7F,CFopsize }},
        {{ 32,0x68,0,0,0x80,CFopsize }, { 0,0x68,0,0,0x80,CFopsize }},
        {{ 32,0x68,0,0,0x10000,CFopsize },    { 0,0x6A,0,0,0x10000,CFopsize }},
        {{ 32,0x68,0,0,0x8000,CFopsize }, { 0,0x68,0,0,0x8000,CFopsize }},
    };

    //config.flags4 |= CFG4space;
    const size_t ntests = sizeof(tests)/sizeof(tests[0]);
    for (int i = 0; i < ntests; i++)
    {
        CS *given = &tests[i][0];       // instruction fed to the optimizer
        CS *want  = &tests[i][1];       // instruction expected afterwards

        // Rows tagged with a memory model only run when that model is active.
        if (given->model &&
            ((I16 && given->model != 16) ||
             (I32 && given->model != 32) ||
             (I64 && given->model != 64)))
        {
            continue;
        }
        //printf("[%d]\n", i);

        // Build a single zeroed instruction with constant operands.
        code cs;
        memset(&cs, 0, sizeof(cs));
        cs.Iop = given->op;
        cs.Iea = given->ea;
        cs.IFL1 = FLconst;
        cs.IFL2 = FLconst;
        cs.IEV1.Vuns = given->ev1;
        cs.IEV2.Vuns = given->ev2;
        cs.Iflags = given->flags;

        pinholeopt(&cs, NULL);

        // Report the opcode mismatch before aborting; it is the usual failure.
        if (cs.Iop != want->op)
        {
            printf("[%d] Iop = x%02x, pout = x%02x\n", i, cs.Iop, want->op);
            assert(0);
        }
        assert(cs.Iea == want->ea);
        assert(cs.IEV1.Vuns == want->ev1);
        assert(cs.IEV2.Vuns == want->ev2);
        assert(cs.Iflags == want->flags);
    }
}
#endif

/**************************
 * Compute jump addresses for FLcode.
 * Note: only works for forward referenced code.
 *       only direct jumps and branches are detected.
 *       LOOP instructions only work for backward refs.
 */

void jmpaddr(code *c)
{
    code *const cstart = c;             // list head; backward searches start here

    //printf("jmpaddr()\n");
    for (; c; c = code_next(c))
    {
        const unsigned op = c->Iop;     // opcode as it was on entry to this iteration
        int backward = 0;               // set when the forward scan misses the target

        if (op <= 0xEB &&
            (inssize[op] & T) &&        // instruction has a second operand
            c->IFL2 == FLcode &&
            ((op & ~0x0F) == 0x70 || op == JMP || op == JMPS || op == JCXZ || op == CALL))
        {
            code *const ctarg = c->IEV2.Vcode;  // instruction being jumped to
            targ_size_t ad = 0;                 // bytes between c and the target

            // Sum the sizes of everything between this instruction and the target.
            code *ci;
            for (ci = code_next(c); ci && ci != ctarg; ci = code_next(ci))
                ad += calccodsize(ci);

            if (!ci)
            {
                // Ran off the end of the list: handle it as a backward reference.
                backward = 1;
            }
            else
            {
                // Everything except a 16 bit conditional branch marked CFjmp16
                // simply takes the displacement.
                const int direct = !I16 ||
                                   op == JMP || op == JMPS || op == JCXZ || op == CALL ||
                                   !(c->Iflags & CFjmp16);
                if (direct)
                {
                    c->IEVpointer2 = ad;
                }
                else
                {
                    // Branch around a long jump: splice a JMP in after c and
                    // turn c into the converse condition that skips over it.
                    code *ljmp = code_calloc();
                    code_next(ljmp) = code_next(c);
                    code_next(c) = ljmp;

                    c->Iop = op ^ 1;            // converse jmp
                    c->Iflags &= ~CFjmp16;
                    c->IEVpointer2 = I16 ? 3 : 5;

                    ljmp->Iop = JMP;            // long jump
                    ljmp->IFL2 = FLconst;
                    ljmp->IEVpointer2 = ad;
                }
                c->IFL2 = FLconst;
            }
        }

        if (backward || (op == LOOP && c->IFL2 == FLcode))
        {
            // Backward reference: find the target starting from the list head;
            // it must appear before c.
            code *const ctarg = c->IEV2.Vcode;
            code *ci = cstart;
            while (ci != ctarg)
            {
                if (!ci || ci == c)
                    assert(0);
                ci = code_next(ci);
            }

            // Displacement magnitude: 2 plus the size of everything from the
            // target up to (not including) c.
            targ_size_t ad = 2;
            while (ci != c)
            {
                assert(ci);
                ad += calccodsize(ci);
                ci = code_next(ci);
            }
            c->IEVpointer2 = (-ad) & 0xFF;
            c->IFL2 = FLconst;
        }
    }
}

/*******************************
 * Calculate bl->Bsize.
 */

unsigned calcblksize(code *c)
{
    // Total of calccodsize() over every instruction in the list.
    unsigned total = 0;

    while (c)
    {
        const unsigned sz = calccodsize(c);
        //printf("off=%02x, sz = %d, code %p: op=%02x\n", total, sz, c, c->Iop);
        total += sz;
        c = code_next(c);
    }
//printf("calcblksize(c = x%x) = %d\n", c, total);
    return total;
}

/*****************************
 * Calculate and return code size of a code.
 * Note that NOPs are sometimes used as markers, but are
 * never output. LINNUMs are never output.
 * Note: This routine must be fast. Profiling shows it is significant.
 */

/* Params:
 *      c = the single instruction to measure (only *c is examined,
 *          not the rest of the list)
 * Returns:
 *      number of bytes computed for c; 0 for NOP and ESCAPE pseudo-ops.
 *      codout() asserts (in DEBUG builds) that it emits exactly this many
 *      bytes, so the two routines must be kept in step.
 */
unsigned calccodsize(code *c)
{   unsigned size;
    unsigned op;
    unsigned char rm,mod,ins;
    unsigned iflags;
    unsigned i32 = I32 || I64;          // set when I32 or I64 is set
    unsigned a32 = i32;                 // selects 32 bit modregrm sizing; flipped below
                                        // by an address size prefix (except for I64)

#ifdef DEBUG
    assert((a32 & ~1) == 0);
#endif
    iflags = c->Iflags;
    op = c->Iop;
    if (iflags & CFvex)
    {
        // VEX form: size and M flag come from vex_inssize(); the prefix
        // accounting below is bypassed entirely.
        ins = vex_inssize(c);
        size = ins & 7;
        goto Lmodrm;
    }
    else if ((op & 0xFF00) == 0x0F00 || (op & 0xFFFD00) == 0x0F3800)
        op = 0x0F;                      // fold every 0F-- / 0F38-- / 0F3A-- opcode onto one case
    else
        op &= 0xFF;
    switch (op)
    {
        case 0x0F:
            if ((c->Iop & 0xFFFD00) == 0x0F3800)
            {   // 3 byte op ( 0F38-- or 0F3A-- )
                ins = inssize2[(c->Iop >> 8) & 0xFF];
                size = ins & 7;
                if (c->Iop & 0xFF000000)        // one more leading byte packed into Iop
                  size++;
            }
            else
            {   // 2 byte op ( 0F-- )
                ins = inssize2[c->Iop & 0xFF];
                size = ins & 7;
                if (c->Iop & 0xFF0000)          // one more leading byte packed into Iop
                  size++;
            }
            break;

        case NOP:
        case ESCAPE:
            size = 0;                   // since these won't be output
            goto Lret2;

        case ASM:
            if (c->Iflags == CFaddrsize)        // kludge for DA inline asm
                size = NPTRSIZE;
            else
                size = c->IEV1.as.len;
            goto Lret2;

        case 0xA1:
        case 0xA3:
            if (c->Irex)
            {
                size = 9;               // 64 bit immediate value for MOV to/from RAX
                goto Lret;              // Lret adds 1 more for the REX byte
            }
            goto Ldefault;

        case 0xF6:                      /* TEST mem8,immed8             */
            ins = inssize[op];
            size = ins & 7;
            if (i32)
                size = inssize32[op];
            if ((c->Irm & (7<<3)) == 0) // reg field 0: the form that carries an immediate
                size++;                 /* size of immed8               */
            break;

        case 0xF7:
            ins = inssize[op];
            size = ins & 7;
            if (i32)
                size = inssize32[op];
            if ((c->Irm & (7<<3)) == 0)
                // immediate is 4 bytes when exactly one of i32 / CFopsize is set, else 2
                size += (i32 ^ ((iflags & CFopsize) !=0)) ? 4 : 2;
            break;

        default:
        Ldefault:
            // Low 3 bits of the inssize[] entry give the size; inssize32[]
            // supplies the size directly when i32 is set.
            ins = inssize[op];
            size = ins & 7;
            if (i32)
                size = inssize32[op];
    }

    if (iflags & (CFwait | CFopsize | CFaddrsize | CFSEG))
    {
        if (iflags & CFwait)    // if add FWAIT prefix
            size++;
        if (iflags & CFSEG)     // if segment override
            size++;

        // If the instruction has a second operand that is not an 8 bit,
        // and the operand size prefix is present, then fix the size computation
        // because the operand size will be different.
        // Walter, I had problems with this bit at the end.  There can still be
        // an ADDRSIZE prefix for these and it does indeed change the operand size.

        if (iflags & (CFopsize | CFaddrsize))
        {
            if ((ins & (T|E)) == T)
            {
                if ((op & 0xAC) == 0xA0)        // matches opcodes A0..A3, B0..B3, E0..E3, F0..F3
                {
                    // Here the second operand size follows the address size,
                    // so only CFaddrsize adjusts it.
                    if (iflags & CFaddrsize && !I64)
                    {   if (I32)
                            size -= 2;
                        else
                            size += 2;
                    }
                }
                else if (iflags & CFopsize)
                {   if (I16)
                        size += 2;
                    else
                        size -= 2;
                }
            }
            if (iflags & CFaddrsize)
            {   if (!I64)
                    a32 ^= 1;
                size++;                         // +1 for the ADDRSIZE prefix
            }
            if (iflags & CFopsize)
                size++;                         /* +1 for OPSIZE prefix         */
        }
    }

Lmodrm:
    if ((op & ~0x0F) == 0x70)
    {   if (iflags & CFjmp16)           // if long branch
            size += I16 ? 3 : 4;        // + 3(4) bytes for JMP
    }
    else if (ins & M)                   // if modregrm byte
    {
        rm = c->Irm;
        mod = rm & 0xC0;
        if (a32 || I64)
        {   // 32 bit addressing
            if (issib(rm))
                size++;
            switch (mod)
            {   case 0:
                    if (issib(rm) && (c->Isib & 7) == 5 ||
                        (rm & 7) == 5)
                        size += 4;      /* disp32                       */
                    if (c->Irex & REX_B && (rm & 7) == 5)
                        /* Instead of selecting R13, this mode is an [RIP] relative
                         * address. Although valid, it's redundant, and should not
                         * be generated. Instead, generate 0[R13] instead of [R13].
                         */
                        assert(0);
                    break;
                case 0x40:
                    size++;             /* disp8                        */
                    break;
                case 0x80:
                    size += 4;          /* disp32                       */
                    break;
            }
        }
        else
        {   // 16 bit addressing
            if (mod == 0x40)            /* 01: 8 bit displacement       */
                size++;
            else if (mod == 0x80 || (mod == 0 && (rm & 7) == 6))
                size += 2;              // 16 bit displacement
        }
    }

Lret:
    if (!(iflags & CFvex) && c->Irex)
    {   size++;                         // REX prefix byte
        if (c->Irex & REX_W && (op & ~7) == 0xB8)
            size += 4;                  // opcodes B8..BF with REX_W: immediate grows from 4 to 8 bytes
    }
Lret2:
    //printf("op = x%02x, size = %d\n",op,size);
    return size;
}

/********************************
 * Return !=0 if codes match.
 */

#if 0

/* Compare two instructions for equivalence.
 * Params:
 *      c1, c2 = instructions to compare (only *c1 and *c2 are examined)
 * Returns:
 *      1 if they match, 0 if not.
 * ESCctor/ESCdtor escapes never match, even against an identical opcode;
 * NOPs and all other ESCAPE pseudo-ops with equal Iop always match.
 * Otherwise opcode, flags, modregrm/sib bytes and whichever operands the
 * instruction actually has must all be equal.
 * NOTE: this function is currently compiled out by the enclosing #if 0.
 */
int code_match(code *c1,code *c2)
{   code cs1,cs2;
    unsigned char ins;
    unsigned char rm;           // modregrm byte (same in both once compared)

    if (c1 == c2)
        goto match;
    cs1 = *c1;
    cs2 = *c2;
    if (cs1.Iop != cs2.Iop)
        goto nomatch;
    switch (cs1.Iop)
    {
        case ESCAPE | ESCctor:
        case ESCAPE | ESCdtor:
            goto nomatch;

        case NOP:
            goto match;

        case ASM:
            // Inline asm matches only if the raw bytes are identical.
            if (cs1.IEV1.as.len == cs2.IEV1.as.len &&
                memcmp(cs1.IEV1.as.bytes,cs2.IEV1.as.bytes,cs1.IEV1.as.len) == 0)
                goto match;
            else
                goto nomatch;

        default:
            if ((cs1.Iop & 0xFF) == ESCAPE)
                goto match;
            break;
    }
    if (cs1.Iflags != cs2.Iflags)
        goto nomatch;

    // Pick the instruction table entry matching the opcode length.
    ins = inssize[cs1.Iop & 0xFF];
    if ((cs1.Iop & 0xFFFD00) == 0x0F3800)
    {
        ins = inssize2[(cs1.Iop >> 8) & 0xFF];
    }
    else if ((cs1.Iop & 0xFF00) == 0x0F00)
    {
        ins = inssize2[cs1.Iop & 0xFF];
    }

    if (ins & M)                // if modregrm byte
    {
        if (cs1.Irm != cs2.Irm)
            goto nomatch;
        if ((cs1.Irm & 0xC0) == 0xC0)
            goto do2;           // register operand: no first operand value
        rm = cs1.Irm;
        if (is32bitaddr(I32,cs1.Iflags))
        {
            if (issib(cs1.Irm) && cs1.Isib != cs2.Isib)
                goto nomatch;
            // mod == 0 carries a displacement only for rm == 5, or for a
            // sib byte whose base field is 5.
            if (
                ((rm & 0xC0) == 0 && !((rm & 7) == 4 && (cs1.Isib & 7) == 5 || (rm & 7) == 5))
               )
                goto do2;       /* if no first operand  */
        }
        else
        {
            // 16 bit addressing: mod == 0 carries a displacement only for rm == 6.
            if (
                ((rm & 0xC0) == 0 && !((rm & 7) == 6))
               )
                goto do2;       /* if no first operand  */
        }
        if (cs1.IFL1 != cs2.IFL1)
            goto nomatch;
        if (flinsymtab[cs1.IFL1] && cs1.IEVsym1 != cs2.IEVsym1)
            goto nomatch;
        if (cs1.IEVoffset1 != cs2.IEVoffset1)
            goto nomatch;
    }

do2:
    if (!(ins & T))                     // if no second operand
        goto match;
    if (cs1.IFL2 != cs2.IFL2)
        goto nomatch;
    if (flinsymtab[cs1.IFL2] && cs1.IEVsym2 != cs2.IEVsym2)
        goto nomatch;
    if (cs1.IEVoffset2 != cs2.IEVoffset2)
        goto nomatch;

match:
    return 1;

nomatch:
    return 0;
}

#endif

/**************************
 * Write code to intermediate file.
 * Code starts at offset.
 * Returns:
 *      addr of end of code
 */

// State shared by codout(), cod3_flush() and the macros below.
static targ_size_t offset;              /* to save code use a global    */
                                        // offset in cseg at which bytes[] will be written
static char bytes[100];                 // staging buffer for generated code bytes
static char *pgen;                      // next free position in bytes[]

// Append one byte to the staging buffer (no bounds check here).
#define GEN(c)          (*pgen++ = (c))
// Append n bytes starting at p to the staging buffer (no bounds check here).
#define GENP(n,p)       (memcpy(pgen,(p),(n)), pgen += (n))
// Write out the staging buffer via cod3_flush(), but only if it is non-empty.
// NOTE: the first form is a bare 'if' statement, so it must not be followed
// by an 'else' at the point of use.
#if ELFOBJ || MACHOBJ || _MSC_VER
#define FLUSH()         if (pgen-bytes) cod3_flush()
#else
#define FLUSH()         ((pgen - bytes) && cod3_flush())
#endif
// Offset at which the next generated byte will land: the flushed offset
// plus the number of bytes currently staged.
#define OFFSET()        (offset + (pgen - bytes))

STATIC void cod3_flush()
{
#ifdef DEBUG
    // The staging buffer must never have been filled to capacity.
    assert(pgen - bytes < sizeof(bytes));
#endif
    // Send the staged bytes to the code segment at the current offset,
    // advance the offset by what obj_bytes() reports, and empty the buffer.
    targ_size_t advance = obj_bytes(cseg,offset,pgen - bytes,bytes);
    offset += advance;
    pgen = bytes;
}

/* Emit the instruction list c into segment cseg, starting at Coffset.
 * Params:
 *      c = head of the instruction list; the list may be modified
 *          (opcodes rewritten, FLblock operands retagged as FLblockoff,
 *          and a JMP inserted after a 16 bit long conditional branch)
 * Returns:
 *      offset just past the last byte emitted; Coffset is set to the same value
 * Bytes are staged in bytes[] through GEN()/GENP() and written out by FLUSH().
 * In DEBUG builds the emitted length of every instruction is checked
 * against calccodsize().
 */
unsigned codout(code *c)
{ unsigned op;
  unsigned char rm,mod;
  unsigned char ins;
  code *cn;
  unsigned flags;
  symbol *s;

#ifdef DEBUG
  if (debugc) printf("codout(%p), Coffset = x%llx\n",c,(unsigned long long)Coffset);
#endif

  pgen = bytes;
  offset = Coffset;
  for (; c; c = code_next(c))
  {
#ifdef DEBUG
        if (debugc) { printf("off=%02lx, sz=%ld, ",(long)OFFSET(),(long)calccodsize(c)); c->print(); }
        unsigned startoffset = OFFSET();
#endif
        op = c->Iop;
        ins = inssize[op & 0xFF];       // provisional; replaced below for 0F-- and VEX opcodes

        // Pseudo-instructions are dealt with here and 'continue' to the next
        // instruction.  A real opcode whose low byte merely equals a pseudo-op
        // value breaks out of the switch and is encoded normally.
        switch (op & 0xFF)
        {   case ESCAPE:
                /* Check for SSE4 opcode v/pmaxuw xmm1,xmm2/m128 */
                if(op == 0x660F383E || c->Iflags & CFvex) break;

                switch (op & 0xFFFF00)
                {   case ESClinnum:
                        /* put out line number stuff    */
                        objlinnum(c->IEV1.Vsrcpos,OFFSET());
                        break;
#if SCPP
#if 1
                    case ESCctor:
                    case ESCdtor:
                    case ESCoffset:
                        if (config.exe != EX_NT)
                            except_pair_setoffset(c,OFFSET() - funcoffset);
                        break;
                    case ESCmark:
                    case ESCrelease:
                    case ESCmark2:
                    case ESCrelease2:
                        break;
#else
                    case ESCctor:
                        except_push(OFFSET() - funcoffset,c->IEV1.Vtor,NULL);
                        break;
                    case ESCdtor:
                        except_pop(OFFSET() - funcoffset,c->IEV1.Vtor,NULL);
                        break;
                    case ESCmark:
                        except_mark();
                        break;
                    case ESCrelease:
                        except_release();
                        break;
#endif
#endif
                }
#ifdef DEBUG
                assert(calccodsize(c) == 0);
#endif
                continue;
            case NOP:                   /* don't send them out          */
                if (op != NOP)
                    break;
#ifdef DEBUG
                assert(calccodsize(c) == 0);
#endif
                continue;
            case ASM:
                if (op != ASM)
                    break;
                // Inline asm bytes bypass the staging buffer, so flush it first.
                FLUSH();
                if (c->Iflags == CFaddrsize)    // kludge for DA inline asm
                {
                    do32bit(FLblockoff,&c->IEV1,0);
                }
                else
                {
                    offset += obj_bytes(cseg,offset,c->IEV1.as.len,c->IEV1.as.bytes);
                }
#ifdef DEBUG
                assert(calccodsize(c) == c->IEV1.as.len);
#endif
                continue;
        }
        flags = c->Iflags;

        // See if we need to flush (don't have room for largest code sequence)
        if (pgen - bytes > sizeof(bytes) - (1+4+4+8+8))
            FLUSH();

        // see if we need to put out prefix bytes
        if (flags & (CFwait | CFPREFIX | CFjmp16))
        {   int override;

            if (flags & CFwait)
                GEN(0x9B);                      // FWAIT
                                                /* ? SEGES : SEGSS      */
            switch (flags & CFSEG)
            {   case CFes:      override = SEGES;       goto segover;
                case CFss:      override = SEGSS;       goto segover;
                case CFcs:      override = SEGCS;       goto segover;
                case CFds:      override = SEGDS;       goto segover;
                case CFfs:      override = SEGFS;       goto segover;
                case CFgs:      override = SEGGS;       goto segover;
                segover:        GEN(override);
                                break;
            }

            if (flags & CFaddrsize)
                GEN(0x67);

            // Do this last because of instructions like ADDPD
            if (flags & CFopsize)
                GEN(0x66);                      /* operand size         */

            if ((op & ~0x0F) == 0x70 && flags & CFjmp16) /* long condit jmp */
            {
                if (!I16)
                {   // Put out 16 bit conditional jump
                    // (rewritten as the two byte opcode 0F 8x with the same condition)
                    c->Iop = op = 0x0F00 | (0x80 | (op & 0x0F));
                }
                else
                {
                    // Insert a JMP to the real target after c, and make c the
                    // converse short branch that skips over that JMP.
                    cn = code_calloc();
                    /*cxcalloc++;*/
                    code_next(cn) = code_next(c);
                    code_next(c) = cn;          // link into code
                    cn->Iop = JMP;              // JMP block
                    cn->IFL2 = c->IFL2;
                    cn->IEV2.Vblock = c->IEV2.Vblock;
                    c->Iop = op ^= 1;           // toggle condition
                    c->IFL2 = FLconst;
                    c->IEVpointer2 = I16 ? 3 : 5; // skip over JMP block
                    c->Iflags &= ~CFjmp16;
                }
            }
        }

        // VEX encoded instruction: 3 byte (C4) or 2 byte (C5) prefix, then the
        // opcode byte; size/flag info comes from vex_inssize().
        if (flags & CFvex)
        {
            if (flags & CFvex3)
            {
                GEN(0xC4);
                GEN(VEX3_B1(c->Ivex));
                GEN(VEX3_B2(c->Ivex));
                GEN(c->Ivex.op);
            }
            else
            {
                GEN(0xC5);
                GEN(VEX2_B1(c->Ivex));
                GEN(c->Ivex.op);
            }
            ins = vex_inssize(c);
            goto Lmodrm;
        }

        // Multi-byte opcodes are packed into op, first byte most significant.
        // A leading F2, F3 or 66 byte is emitted before the REX prefix; any
        // other leading byte is emitted after it.
        if (op > 0xFF)
        {
            if ((op & 0xFFFD00) == 0x0F3800)
                ins = inssize2[(op >> 8) & 0xFF];
            else if ((op & 0xFF00) == 0x0F00)
                ins = inssize2[op & 0xFF];

            if (op & 0xFF000000)
            {
                unsigned char op1 = op >> 24;
                if (op1 == 0xF2 || op1 == 0xF3 || op1 == 0x66)
                {
                    GEN(op1);
                    if (c->Irex)
                        GEN(c->Irex | REX);
                }
                else
                {
                    if (c->Irex)
                        GEN(c->Irex | REX);
                    GEN(op1);
                }
                GEN((op >> 16) & 0xFF);
                GEN((op >> 8) & 0xFF);
                GEN(op & 0xFF);
            }
            else if (op & 0xFF0000)
            {
                unsigned char op1 = op >> 16;
                if (op1 == 0xF2 || op1 == 0xF3 || op1 == 0x66)
                {
                    GEN(op1);
                    if (c->Irex)
                        GEN(c->Irex | REX);
                }
                else
                {
                    if (c->Irex)
                        GEN(c->Irex | REX);
                    GEN(op1);
                }
                GEN((op >> 8) & 0xFF);
                GEN(op & 0xFF);
            }
            else
            {
                if (c->Irex)
                    GEN(c->Irex | REX);
                GEN((op >> 8) & 0xFF);
                GEN(op & 0xFF);
            }
        }
        else
        {
            if (c->Irex)
                GEN(c->Irex | REX);
            GEN(op);
        }
  Lmodrm:
        if (ins & M)            /* if modregrm byte             */
        {
            rm = c->Irm;
            GEN(rm);

            // Look for an address size override when working with the
            // MOD R/M and SIB bytes

            if (is32bitaddr( I32, flags))
            {
                if (issib(rm))
                    GEN(c->Isib);
                switch (rm & 0xC0)
                {   case 0x40:
                        do8bit((enum FL) c->IFL1,&c->IEV1);     // 8 bit
                        break;
                    case 0:
                        // mod == 0 has a disp32 only for rm == 5 or sib base == 5;
                        // in that case fall through to the disp32 case.
                        if (!(issib(rm) && (c->Isib & 7) == 5 ||
                              (rm & 7) == 5))
                            break;
                    case 0x80:
                    {   int flags = CFoff;      // NOTE: shadows the outer 'flags'
                        targ_size_t val = 0;
                        if (I64)
                        {
                            if ((rm & modregrm(3,0,7)) == modregrm(0,0,5))      // if disp32[RIP]
                            {   flags |= CFpc32;
                                // val is passed to do32bit() for the pc-relative
                                // displacement; it depends on how many immediate
                                // bytes follow the disp32 in this instruction.
                                val = -4;
                                unsigned reg = rm & modregrm(0,7,0);
                                if (ins & T ||
                                    ((op == 0xF6 || op == 0xF7) && (reg == modregrm(0,0,0) || reg == modregrm(0,1,0))))
                                {   if (ins & E)
                                        val = -5;
                                    else if (c->Iflags & CFopsize)
                                        val = -6;
                                    else
                                        val = -8;
                                }
#if TARGET_OSX
                                // Mach-O linkage already takes the 4 byte size into account
                                val += 4;
#endif
                            }
                        }
                        do32bit((enum FL)c->IFL1,&c->IEV1,flags,val);
                        break;
                    }
                }
            }
            else
            {
                switch (rm & 0xC0)
                {   case 0x40:
                        do8bit((enum FL) c->IFL1,&c->IEV1);     // 8 bit
                        break;
                    case 0:
                        // 16 bit addressing: mod == 0 has a disp16 only for rm == 6;
                        // in that case fall through to the disp16 case.
                        if ((rm & 7) != 6)
                            break;
                    case 0x80:
                        do16bit((enum FL)c->IFL1,&c->IEV1,CFoff);
                        break;
                }
            }
        }
        else
        {
            // Opcode 0xC8 has no modregrm byte but still has a 16 bit first operand.
            if (op == 0xC8)
                do16bit((enum FL)c->IFL1,&c->IEV1,0);
        }
        flags &= CFseg | CFoff | CFselfrel;     // only these flags are passed on with the second operand
        if (ins & T)                    /* if second operand            */
        {       if (ins & E)            /* if data-8                    */
                    do8bit((enum FL) c->IFL2,&c->IEV2);
                else if (!I16)
                {
                    // NOTE: the do16/do32/do64 labels in this switch are also
                    // goto targets from the I16 switch below, and ptr1616/ptr1632
                    // in the I16 switch are goto targets from here.
                    switch (op)
                    {   case 0xC2:              /* RETN imm16           */
                        case 0xCA:              /* RETF imm16           */
                        do16:
                            do16bit((enum FL)c->IFL2,&c->IEV2,flags);
                            break;

                        case 0xA1:
                        case 0xA3:
                            if (I64 && c->Irex)
                            {
                        do64:
                                do64bit((enum FL)c->IFL2,&c->IEV2,flags);
                                break;
                            }
                            // otherwise falls through to the A0/A2 handling
                        case 0xA0:              /* MOV AL,byte ptr []   */
                        case 0xA2:
                            if (c->Iflags & CFaddrsize && !I64)
                                goto do16;
                            else
                        do32:
                                do32bit((enum FL)c->IFL2,&c->IEV2,flags);
                            break;
                        case 0x9A:
                        case 0xEA:
                            if (c->Iflags & CFopsize)
                                goto ptr1616;
                            else
                                goto ptr1632;

                        case 0x68:              // PUSH immed32
                            if ((enum FL)c->IFL2 == FLblock)
                            {
                                c->IFL2 = FLblockoff;
                                goto do32;
                            }
                            else
                                goto case_default;

                        case CALL:              // CALL rel
                        case JMP:               // JMP  rel
                            flags |= CFselfrel;
                            goto case_default;

                        default:
                            if ((op|0xF) == 0x0F8F) // Jcc rel16 rel32
                                flags |= CFselfrel;
                            if (I64 && (op & ~7) == 0xB8 && c->Irex & REX_W)
                                goto do64;
                        case_default:
                            if (c->Iflags & CFopsize)
                                goto do16;
                            else
                                goto do32;
                            break;
                    }
                }
                else
                {
                    switch (op) {
                        case 0xC2:
                        case 0xCA:
                            goto do16;
                        case 0xA0:
                        case 0xA1:
                        case 0xA2:
                        case 0xA3:
                            if (c->Iflags & CFaddrsize)
                                goto do32;
                            else
                                goto do16;
                            break;
                        case 0x9A:
                        case 0xEA:
                            if (c->Iflags & CFopsize)
                                goto ptr1632;
                            else
                                goto ptr1616;

                        ptr1616:
                        ptr1632:
                            // Both labels share one body: the reference is
                            // emitted directly at 'offset', so the staging
                            // buffer is flushed first.
                            //assert(c->IFL2 == FLfunc);
                            FLUSH();
                            if (c->IFL2 == FLdatseg)
                            {
                                reftodatseg(cseg,offset,c->IEVpointer2,
                                        c->IEVseg2,flags);
                                offset += 4;
                            }
                            else
                            {
                                s = c->IEVsym2;
                                offset += reftoident(cseg,offset,s,0,flags);
                            }
                            break;

                        case 0x68:              // PUSH immed16
                            if ((enum FL)c->IFL2 == FLblock)
                            {   c->IFL2 = FLblockoff;
                                goto do16;
                            }
                            else
                                goto case_default16;

                        case CALL:
                        case JMP:
                            flags |= CFselfrel;
                            // falls through to the default handling
                        default:
                        case_default16:
                            if (c->Iflags & CFopsize)
                                goto do32;
                            else
                                goto do16;
                            break;
                    }
                }
        }
        else if (op == 0xF6)            /* TEST mem8,immed8             */
        {       if ((rm & (7<<3)) == 0)
                        do8bit((enum FL)c->IFL2,&c->IEV2);
        }
        else if (op == 0xF7)
        {   if ((rm & (7<<3)) == 0)     /* TEST mem16/32,immed16/32     */
            {
                // Same rule as calccodsize(): 32 bit immediate when exactly one
                // of (I32 || I64) and CFopsize holds, else 16 bit.
                if ((I32 || I64) ^ ((c->Iflags & CFopsize) != 0))
                    do32bit((enum FL)c->IFL2,&c->IEV2,flags);
                else
                    do16bit((enum FL)c->IFL2,&c->IEV2,flags);
            }
        }
#ifdef DEBUG
        // Cross-check: bytes emitted for this instruction must equal calccodsize().
        if (OFFSET() - startoffset != calccodsize(c))
        {
            printf("actual: %d, calc: %d\n", (int)(OFFSET() - startoffset), (int)calccodsize(c));
            c->print();
            assert(0);
        }
#endif
    }
    FLUSH();
    Coffset = offset;
    //printf("-codout(), Coffset = x%x\n", Coffset);
    return offset;                      /* ending address               */
}


/* Emit an 8 byte operand into the code segment (64 bit code only).
 * Params:
 *      fl    = kind of operand stored in *uev
 *      uev   = the operand value
 *      flags = fixup flags; CFoffset64 is added for FLdatseg, FLfunc and the
 *              symbol cases (FLextern etc.)
 * Values known here (FLconst, FLframehandler, FLblock, and FLgot on OSX)
 * are appended to the staging buffer.  All other kinds flush the buffer,
 * emit a reference at 'offset' through reftodatseg()/reftoident()/
 * reftocodseg(), and then advance 'offset' by 8.
 */
STATIC void do64bit(enum FL fl,union evc *uev,int flags)
{
    symbol *s;
    targ_size_t ad;

    assert(I64);
    switch (fl)
    {
        case FLconst:
            ad = * (targ_size_t *) uev;
        L1:
            // Staged bytes are counted by OFFSET(), so 'offset' is left alone here.
            GENP(8,&ad);
            return;
        case FLdatseg:
            FLUSH();
            reftodatseg(cseg,offset,uev->_EP.Vpointer,uev->_EP.Vseg,CFoffset64 | flags);
            break;
        case FLframehandler:
            // Record where the operand lives and emit 0 as a placeholder.
            framehandleroffset = OFFSET();
            ad = 0;
            goto L1;
        case FLswitch:
            FLUSH();
            ad = uev->Vswitch->Btableoffset;
            if (config.flags & CFGromable)
                    reftocodseg(cseg,offset,ad);
            else
                    reftodatseg(cseg,offset,ad,JMPSEG,CFoff);
            break;
#if TARGET_SEGMENTED
        case FLcsdata:
        case FLfardata:
#if DEBUG
            symbol_print(uev->sp.Vsym);
#endif
#endif
            // NOTE: In ELFOBJ all symbol refs have been tagged FLextern
            // strings and statics are treated like offsets from a
            // un-named external with is the start of .rodata or .data
        case FLextern:                      /* external data symbol         */
        case FLtlsdata:
#if TARGET_LINUX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_SOLARIS
        case FLgot:
        case FLgotoff:
#endif
            FLUSH();
            s = uev->sp.Vsym;               /* symbol pointer               */
            reftoident(cseg,offset,s,uev->sp.Voffset,CFoffset64 | flags);
            break;

#if TARGET_OSX
        case FLgot:
            // Record where the operand lives and emit 0 as a placeholder.
            funcsym_p->Slocalgotoffset = OFFSET();
            ad = 0;
            goto L1;
#endif

        case FLfunc:                        /* function call                */
            s = uev->sp.Vsym;               /* symbol pointer               */
            assert(TARGET_SEGMENTED || !tyfarfunc(s->ty()));
            FLUSH();
            reftoident(cseg,offset,s,0,CFoffset64 | flags);
            break;

        case FLblock:                       /* displacement to another block */
            ad = uev->Vblock->Boffset - OFFSET() - 4;
            //printf("FLblock: funcoffset = %x, OFFSET = %x, Boffset = %x, ad = %x\n", funcoffset, OFFSET(), uev->Vblock->Boffset, ad);
            goto L1;

        case FLblockoff:
            FLUSH();
            assert(uev->Vblock);
            //printf("FLblockoff: offset = %x, Boffset = %x, funcoffset = %x\n", offset, uev->Vblock->Boffset, funcoffset);
            reftocodseg(cseg,offset,uev->Vblock->Boffset);
            break;

        default:
#ifdef DEBUG
            WRFL(fl);
#endif
            assert(0);
    }
    // Every case that 'break's emitted an 8 byte reference directly at 'offset'.
    offset += 8;
}


/*************************
 * Emit the 4-byte operand selected by fl, taking its value from *uev.
 * Params:
 *      fl    = operand kind (FLxxx) that selects how *uev is interpreted
 *      uev   = operand value; which union member is read depends on fl
 *      flags = CFxxx flags forwarded to the fixup routines; CFselfrel is
 *              also tested directly for FLfunc on segmented targets
 *      val   = extra offset; added to the symbol offset for data symbol
 *              references and passed as the offset for the generic FLfunc
 *              reference (the prototype defaults it to 0)
 * Cases that emit literal bytes go through GENP() at L1 and return without
 * touching `offset`; cases that hand the operand to one of the reftoXXX()
 * routines call FLUSH() first and then advance `offset` by 4 at the bottom.
 */
STATIC void do32bit(enum FL fl,union evc *uev,int flags, targ_size_t val)
{ char *p;                      // NOTE(review): not referenced by name in this function — confirm no emit macro needs it before removing
  symbol *s;
  targ_size_t ad;

  //printf("do32bit(flags = x%x)\n", flags);
  switch (fl)
  {
    case FLconst:
        assert(sizeof(targ_size_t) == 4 || sizeof(targ_size_t) == 8);
        // the constant is read from the start of the union
        ad = * (targ_size_t *) uev;
    L1:
        // common exit for all cases that emit literal bytes: GENP is given
        // the address of ad and a length of 4
        GENP(4,&ad);
        return;
    case FLdatseg:
        FLUSH();
        reftodatseg(cseg,offset,uev->_EP.Vpointer,uev->_EP.Vseg,flags);
        break;
    case FLframehandler:
        // remember where the operand lives, then emit a zero placeholder
        framehandleroffset = OFFSET();
        ad = 0;
        goto L1;
    case FLswitch:
        FLUSH();
        ad = uev->Vswitch->Btableoffset;
        // the table is referenced in the code segment for ROMable output,
        // otherwise in JMPSEG
        if (config.flags & CFGromable)
                reftocodseg(cseg,offset,ad);
        else
                reftodatseg(cseg,offset,ad,JMPSEG,CFoff);
        break;
#if TARGET_SEGMENTED
    case FLcsdata:
    case FLfardata:
#if DEBUG
        symbol_print(uev->sp.Vsym);
#endif
#endif
        // (falls through to the symbol reference below)
        // NOTE: In ELFOBJ all symbol refs have been tagged FLextern
        // strings and statics are treated like offsets from a
        // un-named external which is the start of .rodata or .data
    case FLextern:                      /* external data symbol         */
    case FLtlsdata:
#if TARGET_LINUX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_SOLARIS
    case FLgot:
    case FLgotoff:
#endif
        FLUSH();
        s = uev->sp.Vsym;               /* symbol pointer               */
        reftoident(cseg,offset,s,uev->sp.Voffset + val,flags);
        break;

#if TARGET_OSX
    case FLgot:
        // record the operand position in the function symbol and emit a
        // zero placeholder
        funcsym_p->Slocalgotoffset = OFFSET();
        ad = 0;
        goto L1;
#endif

    case FLfunc:                        /* function call                */
        s = uev->sp.Vsym;               /* symbol pointer               */
#if TARGET_SEGMENTED
        if (tyfarfunc(s->ty()))
        {       /* Large code references are always absolute    */
                FLUSH();
                // together with the "offset += 4" at the bottom, offset
                // advances by exactly the value reftoident() returns
                offset += reftoident(cseg,offset,s,0,flags) - 4;
        }
        else if (s->Sseg == cseg &&
                 (s->Sclass == SCstatic || s->Sclass == SCglobal) &&
                 s->Sxtrnnum == 0 && flags & CFselfrel)
        {       /* if we know it's relative address     */
                // displacement measured from the end of this 4-byte operand
                ad = s->Soffset - OFFSET() - 4;
                goto L1;
        }
        else
#endif
        // when TARGET_SEGMENTED is 0 the block below runs unconditionally
        {
                assert(TARGET_SEGMENTED || !tyfarfunc(s->ty()));
                FLUSH();
                reftoident(cseg,offset,s,val,flags);
        }
        break;

    case FLblock:                       /* displacement to another block */
        // displacement measured from the end of this 4-byte operand
        ad = uev->Vblock->Boffset - OFFSET() - 4;
        //printf("FLblock: funcoffset = %x, OFFSET = %x, Boffset = %x, ad = %x\n", funcoffset, OFFSET(), uev->Vblock->Boffset, ad);
        goto L1;

    case FLblockoff:
        FLUSH();
        assert(uev->Vblock);
        //printf("FLblockoff: offset = %x, Boffset = %x, funcoffset = %x\n", offset, uev->Vblock->Boffset, funcoffset);
        reftocodseg(cseg,offset,uev->Vblock->Boffset);
        break;

    default:
        // unsupported operand kind for a 32-bit operand
#ifdef DEBUG
        WRFL(fl);
#endif
        assert(0);
  }
  // account for the 4 bytes produced by the reftoXXX() call above
  offset += 4;
}


/*************************
 * Emit the 2-byte operand selected by fl, taking its value from *uev.
 * Params:
 *      fl    = operand kind (FLxxx) that selects how *uev is interpreted
 *      uev   = operand value; which union member is read depends on fl
 *      flags = CFxxx flags forwarded to the fixup routines; CFselfrel is
 *              also tested directly for FLfunc
 * Cases that emit literal bytes use GENP() and return without touching
 * `offset`; cases that hand the operand to one of the reftoXXX() routines
 * call FLUSH() first and then advance `offset` by 2 at the bottom.
 */
STATIC void do16bit(enum FL fl,union evc *uev,int flags)
{ char *p;                      // NOTE(review): not referenced by name in this function — confirm no emit macro needs it before removing
  symbol *s;
  targ_size_t ad;

  switch (fl)
  {
    case FLconst:
        // emit the first two bytes stored in the union
        GENP(2,(char *) uev);
        return;
    case FLdatseg:
        FLUSH();
        reftodatseg(cseg,offset,uev->_EP.Vpointer,uev->_EP.Vseg,flags);
        break;
    case FLswitch:
        FLUSH();
        ad = uev->Vswitch->Btableoffset;
        // the table is referenced in the code segment for ROMable output,
        // otherwise in JMPSEG
        if (config.flags & CFGromable)
                reftocodseg(cseg,offset,ad);
        else
                reftodatseg(cseg,offset,ad,JMPSEG,CFoff);
        break;
#if TARGET_SEGMENTED
    case FLcsdata:
    case FLfardata:
#endif
    case FLextern:                      /* external data symbol         */
    case FLtlsdata:
        // 16-bit symbol references are only expected for 16-bit or
        // segmented targets
        assert(SIXTEENBIT || TARGET_SEGMENTED);
        FLUSH();
        s = uev->sp.Vsym;               /* symbol pointer               */
        reftoident(cseg,offset,s,uev->sp.Voffset,flags);
        break;
    case FLfunc:                        /* function call                */
        assert(SIXTEENBIT || TARGET_SEGMENTED);
        s = uev->sp.Vsym;               /* symbol pointer               */
        if (tyfarfunc(s->ty()))
        {       /* Large code references are always absolute    */
                FLUSH();
                // together with the "offset += 2" at the bottom, offset
                // advances by exactly the value reftoident() returns
                offset += reftoident(cseg,offset,s,0,flags) - 2;
        }
        else if (s->Sseg == cseg &&
                 (s->Sclass == SCstatic || s->Sclass == SCglobal) &&
                 s->Sxtrnnum == 0 && flags & CFselfrel)
        {       /* if we know it's relative address     */
                // displacement measured from the end of this 2-byte operand
                ad = s->Soffset - OFFSET() - 2;
                goto L1;
        }
        else
        {       FLUSH();
                reftoident(cseg,offset,s,0,flags);
        }
        break;
    case FLblock:                       /* displacement to another block */
        // displacement measured from the end of this 2-byte operand
        ad = uev->Vblock->Boffset - OFFSET() - 2;
#ifdef DEBUG
        {
            // the displacement must be representable as a signed 16-bit value
            targ_ptrdiff_t delta = uev->Vblock->Boffset - OFFSET() - 2;
            assert((signed short)delta == delta);
        }
#endif
    L1:
        // shared by FLblock and the self-relative FLfunc case
        GENP(2,&ad);                    // displacement
        return;

    case FLblockoff:
        FLUSH();
        reftocodseg(cseg,offset,uev->Vblock->Boffset);
        break;

    default:
        // unsupported operand kind for a 16-bit operand
#ifdef DEBUG
        WRFL(fl);
#endif
        assert(0);
  }
  // account for the 2 bytes produced by the reftoXXX() call above
  offset += 2;
}

/*************************
 * Emit the 1-byte operand selected by fl, taking its value from *uev.
 * Params:
 *      fl  = operand kind; only FLconst and FLblock are accepted
 *      uev = operand value: Vuns for FLconst, Vblock for FLblock
 * For FLblock the byte is the displacement to the target block, measured
 * from the end of this operand. A displacement that does not fit in a
 * signed char is reported on stderr and followed by err_exit().
 */
STATIC void do8bit(enum FL fl,union evc *uev)
{
  char byte;

  if (fl == FLconst)
  {
      byte = uev->Vuns;
  }
  else if (fl == FLblock)
  {
      targ_ptrdiff_t disp = uev->Vblock->Boffset - OFFSET() - 1;

      // reject displacements that would be truncated by the byte store
      if ((signed char)disp != disp)
      {
#if MARS
          if (uev->Vblock->Bsrcpos.Slinnum)
              fprintf(stderr, "%s(%d): ", uev->Vblock->Bsrcpos.Sfilename, uev->Vblock->Bsrcpos.Slinnum);
#endif
          fprintf(stderr, "block displacement of %lld exceeds the maximum offset of -128 to 127.\n", (long long)disp);
          err_exit();
      }
      byte = disp;
#ifdef DEBUG
      assert(uev->Vblock->Boffset > OFFSET() || byte != 0x7F);
#endif
  }
  else
  {
      // any other operand kind is a caller error
#ifdef DEBUG
      fprintf(stderr,"fl = %d\n",fl);
#endif
      assert(0);
  }

  GEN(byte);
}


/**********************************
 */

#if HYDRATE
/* Hydrate a list of code nodes in place.
 * Params:
 *      pc = address of the pointer to the first node; must not be NULL
 * Each node is hydrated with ph_hydrate(), then the pointers held in its
 * first (IEV1) and second (IEV2) operands are hydrated according to the
 * operand kinds IFL1/IFL2. The walk follows code_next() and stops at the
 * first pointer that is null.
 */
void code_hydrate(code **pc)
{
    code *c;
    unsigned char ins,rm;
    enum FL fl;

    assert(pc);
    while (*pc)
    {
        c = (code *) ph_hydrate(pc);
        // fetch the descriptor bits (M, T, ...) for this opcode from the
        // table that matches its encoding
        if (c->Iflags & CFvex)
            ins = vex_inssize(c);
        else if ((c->Iop & 0xFFFD00) == 0x0F3800)
            ins = inssize2[(c->Iop >> 8) & 0xFF];
        else if ((c->Iop & 0xFF00) == 0x0F00)
            ins = inssize2[c->Iop & 0xFF];
        else
            ins = inssize[c->Iop & 0xFF];
        // pseudo-instructions keep their payload in IEV1 and have no
        // ordinary operands, so they are handled here and skip the rest
        switch (c->Iop)
        {
            default:
                break;

            case ESCAPE | ESClinnum:
                srcpos_hydrate(&c->IEV1.Vsrcpos);
                goto done;

            case ESCAPE | ESCctor:
            case ESCAPE | ESCdtor:
                el_hydrate(&c->IEV1.Vtor);
                goto done;

            case ASM:
                ph_hydrate(&c->IEV1.as.bytes);
                goto done;
        }
        // no modregrm byte, or mod field == 3: nothing to hydrate in IEV1
        if (!(ins & M) ||
            ((rm = c->Irm) & 0xC0) == 0xC0)
            goto do2;           /* if no first operand          */
        if (is32bitaddr(I32,c->Iflags))
        {

            // with mod == 0 only rm == 5, or rm == 4 with a sib base of 5,
            // is treated as carrying a first operand
            if (
                ((rm & 0xC0) == 0 && !((rm & 7) == 4 && (c->Isib & 7) == 5 || (rm & 7) == 5))
               )
                goto do2;       /* if no first operand  */
        }
        else
        {
            // with mod == 0 only rm == 6 is treated as carrying a first
            // operand
            if (
                ((rm & 0xC0) == 0 && !((rm & 7) == 6))
               )
                goto do2;       /* if no first operand  */
        }
        fl = (enum FL) c->IFL1;
        switch (fl)
        {
            // operand kinds that refer to a symbol: hydrate the symbol
            case FLudata:
            case FLdata:
            case FLreg:
            case FLauto:
            case FLbprel:
            case FLpara:
#if TARGET_SEGMENTED
            case FLcsdata:
            case FLfardata:
#endif
            case FLtlsdata:
            case FLfunc:
            case FLpseudo:
            case FLextern:
            case FLtmp:
                assert(flinsymtab[fl]);
                symbol_hydrate(&c->IEVsym1);
                symbol_debug(c->IEVsym1);
                break;
            // operand kinds for which no hydration is performed
            case FLdatseg:
            case FLfltreg:
            case FLallocatmp:
            case FLcs:
            case FLndp:
            case FLoffset:
            case FLlocalsize:
            case FLconst:
            case FLframehandler:
                assert(!flinsymtab[fl]);
                break;
            case FLcode:
                (void) ph_hydrate(&c->IEV1.Vcode);
                break;
            case FLblock:
            case FLblockoff:
                (void) ph_hydrate(&c->IEV1.Vblock);
                break;
#if SCPP
            case FLctor:
            case FLdtor:
                el_hydrate(&c->IEV1.Vtor);
                break;
#endif
            case FLasm:
                (void) ph_hydrate(&c->IEV1.as.bytes);
                break;
            default:
                // unexpected operand kind
#ifdef DEBUG
                WRFL(fl);
#endif
                assert(0);
                break;
        }
    do2:
        /* Ignore TEST (F6 and F7) opcodes      */
        if (!(ins & T))
            goto done;          /* if no second operand */

        fl = (enum FL) c->IFL2;
        switch (fl)
        {
            // operand kinds that refer to a symbol: hydrate the symbol
            case FLudata:
            case FLdata:
            case FLreg:
            case FLauto:
            case FLbprel:
            case FLpara:
#if TARGET_SEGMENTED
            case FLcsdata:
            case FLfardata:
#endif
            case FLtlsdata:
            case FLfunc:
            case FLpseudo:
            case FLextern:
            case FLtmp:
                assert(flinsymtab[fl]);
                symbol_hydrate(&c->IEVsym2);
                symbol_debug(c->IEVsym2);
                break;
            // operand kinds for which no hydration is performed
            case FLdatseg:
            case FLfltreg:
            case FLallocatmp:
            case FLcs:
            case FLndp:
            case FLoffset:
            case FLlocalsize:
            case FLconst:
            case FLframehandler:
                assert(!flinsymtab[fl]);
                break;
            case FLcode:
                (void) ph_hydrate(&c->IEV2.Vcode);
                break;
            case FLblock:
            case FLblockoff:
                (void) ph_hydrate(&c->IEV2.Vblock);
                break;
            default:
                // unexpected operand kind
#ifdef DEBUG
                WRFL(fl);
#endif
                assert(0);
                break;
        }
  done:
        ;

        // continue with the link field of the node just processed
        pc = &code_next(c);
    }
}
#endif

/**********************************
 */

#if DEHYDRATE
/* Dehydrate a list of code nodes in place.
 * Params:
 *      pc = address of the pointer to the first node
 * For each node the link pointer *pc is dehydrated with ph_dehydrate(),
 * then the pointers held in its first (IEV1) and second (IEV2) operands are
 * dehydrated according to the operand kinds IFL1/IFL2. The node itself is
 * still reached through the local copy `c`, taken before *pc is dehydrated.
 */
void code_dehydrate(code **pc)
{
    code *c;
    unsigned char ins,rm;
    enum FL fl;

    while ((c = *pc) != NULL)
    {
        ph_dehydrate(pc);

        // fetch the descriptor bits (M, T, ...) for this opcode from the
        // table that matches its encoding
        if (c->Iflags & CFvex)
            ins = vex_inssize(c);
        else if ((c->Iop & 0xFFFD00) == 0x0F3800)
            ins = inssize2[(c->Iop >> 8) & 0xFF];
        else if ((c->Iop & 0xFF00) == 0x0F00)
            ins = inssize2[c->Iop & 0xFF];
        else
            ins = inssize[c->Iop & 0xFF];
        // pseudo-instructions keep their payload in IEV1 and have no
        // ordinary operands, so they are handled here and skip the rest
        switch (c->Iop)
        {
            default:
                break;

            case ESCAPE | ESClinnum:
                srcpos_dehydrate(&c->IEV1.Vsrcpos);
                goto done;

            case ESCAPE | ESCctor:
            case ESCAPE | ESCdtor:
                el_dehydrate(&c->IEV1.Vtor);
                goto done;

            case ASM:
                ph_dehydrate(&c->IEV1.as.bytes);
                goto done;
        }

        // no modregrm byte, or mod field == 3: nothing to dehydrate in IEV1
        if (!(ins & M) ||
            ((rm = c->Irm) & 0xC0) == 0xC0)
            goto do2;           /* if no first operand          */
        if (is32bitaddr(I32,c->Iflags))
        {

            // with mod == 0 only rm == 5, or rm == 4 with a sib base of 5,
            // is treated as carrying a first operand
            if (
                ((rm & 0xC0) == 0 && !((rm & 7) == 4 && (c->Isib & 7) == 5 || (rm & 7) == 5))
               )
                goto do2;       /* if no first operand  */
        }
        else
        {
            // with mod == 0 only rm == 6 is treated as carrying a first
            // operand
            if (
                ((rm & 0xC0) == 0 && !((rm & 7) == 6))
               )
                goto do2;       /* if no first operand  */
        }
        fl = (enum FL) c->IFL1;
        switch (fl)
        {
            // operand kinds that refer to a symbol: dehydrate the symbol
            case FLudata:
            case FLdata:
            case FLreg:
            case FLauto:
            case FLbprel:
            case FLpara:
#if TARGET_SEGMENTED
            case FLcsdata:
            case FLfardata:
#endif
            case FLtlsdata:
            case FLfunc:
            case FLpseudo:
            case FLextern:
            case FLtmp:
                assert(flinsymtab[fl]);
                symbol_dehydrate(&c->IEVsym1);
                break;
            // operand kinds for which no dehydration is performed
            case FLdatseg:
            case FLfltreg:
            case FLallocatmp:
            case FLcs:
            case FLndp:
            case FLoffset:
            case FLlocalsize:
            case FLconst:
            case FLframehandler:
                assert(!flinsymtab[fl]);
                break;
            case FLcode:
                ph_dehydrate(&c->IEV1.Vcode);
                break;
            case FLblock:
            case FLblockoff:
                ph_dehydrate(&c->IEV1.Vblock);
                break;
#if SCPP
            case FLctor:
            case FLdtor:
                el_dehydrate(&c->IEV1.Vtor);
                break;
#endif
            case FLasm:
                ph_dehydrate(&c->IEV1.as.bytes);
                break;
            default:
                // unexpected operand kind
#ifdef DEBUG
                WRFL(fl);
#endif
                assert(0);
                break;
        }
    do2:
        /* Ignore TEST (F6 and F7) opcodes      */
        if (!(ins & T))
            goto done;          /* if no second operand */

        fl = (enum FL) c->IFL2;
        switch (fl)
        {
            // operand kinds that refer to a symbol: dehydrate the symbol
            case FLudata:
            case FLdata:
            case FLreg:
            case FLauto:
            case FLbprel:
            case FLpara:
#if TARGET_SEGMENTED
            case FLcsdata:
            case FLfardata:
#endif
            case FLtlsdata:
            case FLfunc:
            case FLpseudo:
            case FLextern:
            case FLtmp:
                assert(flinsymtab[fl]);
                symbol_dehydrate(&c->IEVsym2);
                break;
            // operand kinds for which no dehydration is performed
            case FLdatseg:
            case FLfltreg:
            case FLallocatmp:
            case FLcs:
            case FLndp:
            case FLoffset:
            case FLlocalsize:
            case FLconst:
            case FLframehandler:
                assert(!flinsymtab[fl]);
                break;
            case FLcode:
                ph_dehydrate(&c->IEV2.Vcode);
                break;
            case FLblock:
            case FLblockoff:
                ph_dehydrate(&c->IEV2.Vblock);
                break;
            default:
                // unexpected operand kind
#ifdef DEBUG
                WRFL(fl);
#endif
                assert(0);
                break;
        }
  done:
        ;
        // continue with the link field of the node just processed
        pc = &code_next(c);
    }
}
#endif

/***************************
 * Debug code to dump code structure.
 */

#if DEBUG

/* Print every node of the code list starting at c, one print() per node. */
void WRcodlst(code *c)
{
    while (c)
    {
        c->print();
        c = code_next(c);
    }
}

void code::print()
{
    unsigned char ins;
    unsigned char rexb;
    code *c = this;

    if (c == CNIL)
    {   printf("code 0\n");
        return;
    }

    unsigned op = c->Iop;
    if (c->Iflags & CFvex)
        ins = vex_inssize(c);
    else if ((c->Iop & 0xFFFD00) == 0x0F3800)
        ins = inssize2[(op >> 8) & 0xFF];
    else if ((c->Iop & 0xFF00) == 0x0F00)
        ins = inssize2[op & 0xFF];
    else
        ins = inssize[op & 0xFF];

    printf("code %p: nxt=%p ",c,code_next(c));

    if (c->Iflags & CFvex)
    {
        if (c->Iflags & CFvex3)
        {   printf("vex=0xC4");
            printf(" 0x%02X", VEX3_B1(c->Ivex));
            printf(" 0x%02X", VEX3_B2(c->Ivex));
            rexb =
                ( c->Ivex.w ? REX_W : 0) |
                (!c->Ivex.r ? REX_R : 0) |
                (!c->Ivex.x ? REX_X : 0) |
                (!c->Ivex.b ? REX_B : 0);
        }
        else
        {   printf("vex=0xC5");
            printf(" 0x%02X", VEX2_B1(c->Ivex));
            rexb = !c->Ivex.r ? REX_R : 0;
        }
        printf(" ");
    }
    else
        rexb = c->Irex;

    if (rexb)
    {   printf("rex=0x%02X ", c->Irex);
        if (rexb & REX_W)
            printf("W");
        if (rexb & REX_R)
            printf("R");
        if (rexb & REX_X)
            printf("X");
        if (rexb & REX_B)
            printf("B");
        printf(" ");
    }
    printf("op=0x%02X",op);

  if ((op & 0xFF) == ESCAPE)
  {     if ((op & 0xFF00) == ESClinnum)
        {   printf(" linnum = %d\n",c->IEV1.Vsrcpos.Slinnum);
            return;
        }
        printf(" ESCAPE %d",c->Iop >> 8);
  }
  if (c->Iflags)
        printf(" flg=%x",c->Iflags);
  if (ins & M)
  {     unsigned rm = c->Irm;
        printf(" rm=0x%02X=%d,%d,%d",rm,(rm>>6)&3,(rm>>3)&7,rm&7);
        if (!I16 && issib(rm))
        {   unsigned char sib = c->Isib;
            printf(" sib=%02x=%d,%d,%d",sib,(sib>>6)&3,(sib>>3)&7,sib&7);
        }
        if ((rm & 0xC7) == BPRM || (rm & 0xC0) == 0x80 || (rm & 0xC0) == 0x40)
        {
            switch (c->IFL1)
            {
                case FLconst:
                case FLoffset:
                    printf(" int = %4d",c->IEV1.Vuns);
                    break;
                case FLblock:
                    printf(" block = %p",c->IEV1.Vblock);
                    break;
                case FLswitch:
                case FLblockoff:
                case FLlocalsize:
                case FLframehandler:
                case 0:
                    break;
                case FLdatseg:
                    printf(" %d.%llx",c->IEVseg1,(unsigned long long)c->IEVpointer1);
                    break;
                case FLauto:
                case FLreg:
                case FLdata:
                case FLudata:
                case FLpara:
                case FLtmp:
                case FLbprel:
                case FLtlsdata:
                    printf(" sym='%s'",c->IEVsym1->Sident);
                    break;
                case FLextern:
                    printf(" FLextern offset = %4d",(int)c->IEVoffset1);
                    break;
                default:
                    WRFL((enum FL)c->IFL1);
                    break;
            }
        }
  }
  if (ins & T)
  {     printf(" "); WRFL((enum FL)c->IFL2);
        switch (c->IFL2)
        {
            case FLconst:
                printf(" int = %4d",c->IEV2.Vuns);
                break;
            case FLblock:
                printf(" block = %p",c->IEV2.Vblock);
                break;
            case FLswitch:
            case FLblockoff:
            case 0:
            case FLlocalsize:
            case FLframehandler:
                break;
            case FLdatseg:
                printf(" %d.%llx",c->IEVseg2,(unsigned long long)c->IEVpointer2);
                break;
            case FLauto:
            case FLreg:
            case FLpara:
            case FLtmp:
            case FLbprel:
            case FLfunc:
            case FLdata:
            case FLudata:
            case FLtlsdata:
                printf(" sym='%s'",c->IEVsym2->Sident);
                break;
            case FLcode:
                printf(" code = %p",c->IEV2.Vcode);
                break;
            default:
                WRFL((enum FL)c->IFL2);
                break;
        }
  }
  printf("\n");
}
#endif

#endif // !SPP