From b5036e1e355a95019ff3b77cc46da72db1acb184 Mon Sep 17 00:00:00 2001
From: David Nadlinger <code@klickverbot.at>
Date: Fri, 22 Apr 2011 19:06:47 +0200
Subject: [PATCH] Merged DMD commit f30daa9797cf620ba264c0446f0215eb2464ea7a:
 bugzilla 4389 ICE(constfold.c, expression.c), or wrong code: string~=dchar in
 CTFE

---
 dmd/constfold.c |  18 ++++---
 dmd/utf.c       | 125 ++++++++++++++++++++++++++++++++++++++++++++++++
 dmd/utf.h       |   9 ++++
 3 files changed, 146 insertions(+), 6 deletions(-)

diff --git a/dmd/constfold.c b/dmd/constfold.c
index 5e003435..83075573 100644
--- a/dmd/constfold.c
+++ b/dmd/constfold.c
@@ -25,6 +25,7 @@
 #include "expression.h"
 #include "aggregate.h"
 #include "declaration.h"
+#include "utf.h"
 
 #ifdef IN_GCC
 #include "d-gcc-real.h"
@@ -1329,10 +1330,12 @@ Expression *Cat(Type *type, Expression *e1, Expression *e2)
 
     if (e1->op == TOKnull && (e2->op == TOKint64 || e2->op == TOKstructliteral))
     {   e = e2;
+        t = t1;
         goto L2;
     }
     else if ((e1->op == TOKint64 || e1->op == TOKstructliteral) && e2->op == TOKnull)
     {   e = e1;
+        t = t2;
      L2:
         Type *tn = e->type->toBasetype();
         if (tn->ty == Tchar || tn->ty == Twchar || tn->ty == Tdchar)
@@ -1340,12 +1343,15 @@ Expression *Cat(Type *type, Expression *e1, Expression *e2)
             // Create a StringExp
             void *s;
             StringExp *es;
-            size_t len = 1;
-            int sz = tn->size();
+            if (t->nextOf())
+                t = t->nextOf()->toBasetype();
+            int sz = t->size();
+
             dinteger_t v = e->toInteger();
 
+            size_t len = utf_codeLength(sz, v);
             s = mem.malloc((len + 1) * sz);
-            memcpy((unsigned char *)s, &v, sz);
+            utf_encode(sz, s, v);
 
             // Add terminating 0
             memset((unsigned char *)s + len * sz, 0, sz);
@@ -1407,13 +1413,13 @@ Expression *Cat(Type *type, Expression *e1, Expression *e2)
         StringExp *es1 = (StringExp *)e1;
         StringExp *es;
         Type *t;
-        size_t len = es1->len + 1;
         int sz = es1->sz;
         dinteger_t v = e2->toInteger();
 
+        size_t len = es1->len + utf_codeLength(sz, v);
         s = mem.malloc((len + 1) * sz);
         memcpy(s, es1->string, es1->len * sz);
-        memcpy((unsigned char *)s + es1->len * sz, &v, sz);
+        utf_encode(sz, (unsigned char *)s + (sz * es1->len), v);
 
         // Add terminating 0
         memset((unsigned char *)s + len * sz, 0, sz);
@@ -1463,7 +1469,7 @@ Expression *Cat(Type *type, Expression *e1, Expression *e2)
 
         if (type->toBasetype()->ty == Tsarray)
         {
-            e->type = new TypeSArray(t1->next, new IntegerExp(loc, es1->elements->dim, Type::tindex));
+            e->type = new TypeSArray(t1->nextOf(), new IntegerExp(loc, es1->elements->dim, Type::tindex));
             e->type = e->type->semantic(loc, NULL);
         }
         else
diff --git a/dmd/utf.c b/dmd/utf.c
index 6467c65b..6350056e 100644
--- a/dmd/utf.c
+++ b/dmd/utf.c
@@ -11,6 +11,7 @@
 // http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
 
 #include <stdio.h>
+#include <string.h>
 #include <assert.h>
 
 #include "utf.h"
@@ -21,6 +22,40 @@ int utf_isValidDchar(dchar_t c)
         (c > 0xDFFF && c <= 0x10FFFF && c != 0xFFFE && c != 0xFFFF);
 }
 
+static const unsigned char UTF8stride[256] =   // sequence length, indexed by byte value
+{
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   // 0x00-0x7F (8 rows): length 1
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,   // 0x80-0xBF (4 rows): 0xFF, not a sequence start
+    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,   // 0xC0-0xDF (2 rows): length 2
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,   // 0xE0-0xEF: length 3
+    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,   // 0xF0-0xF7: 4, 0xF8-0xFB: 5, 0xFC-0xFD: 6, 0xFE-0xFF: 0xFF
+};
+
+/**
+ * Look up the length of the UTF-8 sequence whose first byte is s[i].
+ * The length is read from the UTF8stride table, indexed by that byte.
+ * Returns:
+ *  The number of bytes in the UTF-8 sequence, or
+ *  0xFF meaning s[i] is not the start of a UTF-8 sequence.
+ */
+
+unsigned stride(unsigned char* s, size_t i)
+{
+    const unsigned char lead = s[i];
+    return UTF8stride[lead];
+}
+
 /********************************************
  * Decode a single UTF-8 character sequence.
  * Returns:
@@ -193,3 +228,93 @@ const char *utf_decodeWchar(unsigned short *s, size_t len, size_t *pidx, dchar_t
     return msg;
 }
 
+// Encode code point c as UTF-8 at s: writes 1 to 4 bytes and no terminator.
+// Asserts when c > 0x10FFFF. Bytes are built as unsigned char, matching s.
+void utf_encodeChar(unsigned char *s, dchar_t c)
+{
+    if (c <= 0x7F)
+        s[0] = (unsigned char)c;
+    else if (c <= 0x7FF)
+    {
+        s[0] = (unsigned char)(0xC0 | (c >> 6));
+        s[1] = (unsigned char)(0x80 | (c & 0x3F));
+    }
+    else if (c <= 0xFFFF)
+    {
+        s[0] = (unsigned char)(0xE0 | (c >> 12));
+        s[1] = (unsigned char)(0x80 | ((c >> 6) & 0x3F));
+        s[2] = (unsigned char)(0x80 | (c & 0x3F));
+    }
+    else if (c <= 0x10FFFF)
+    {
+        s[0] = (unsigned char)(0xF0 | (c >> 18));
+        s[1] = (unsigned char)(0x80 | ((c >> 12) & 0x3F));
+        s[2] = (unsigned char)(0x80 | ((c >> 6) & 0x3F));
+        s[3] = (unsigned char)(0x80 | (c & 0x3F));
+    }
+    else
+        assert(0);
+}
+
+// Encode c as UTF-16 at s: one unit, or a surrogate pair above 0xFFFF.
+void utf_encodeWchar(unsigned short *s, dchar_t c)
+{
+    assert(c <= 0x10FFFF);  // larger values would be masked into a bogus pair
+    if (c <= 0xFFFF)
+        s[0] = (unsigned short)c;
+    else
+    {
+        s[0] = (unsigned short)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);  // high surrogate
+        s[1] = (unsigned short)(((c - 0x10000) & 0x3FF) + 0xDC00);          // low surrogate
+    }
+}
+
+
+/**
+ * Returns how many UTF-8 code units (bytes) are needed to encode c.
+ * Values above 0x10FFFF assert; with asserts compiled out the result is 6.
+ */
+
+int utf_codeLengthChar(dchar_t c)
+{
+    if (c <= 0x7F) return 1;
+    if (c <= 0x7FF) return 2;
+    if (c <= 0xFFFF) return 3;
+    if (c <= 0x10FFFF) return 4;
+    assert(false);
+    return 6;
+}
+
+int utf_codeLengthWchar(dchar_t c)   // UTF-16 code units needed to encode c
+{
+    return (c > 0xFFFF) ? 2 : 1;     // above 0xFFFF takes two units
+}
+
+/**
+ * Returns the number of code units needed to encode c.
+ * sz selects the encoding by unit size: 1 = UTF-8, 2 = UTF-16, 4 = UTF-32.
+ * The result counts code units of sz bytes each, not bytes.
+ */
+int utf_codeLength(int sz, dchar_t c)
+{
+    switch (sz)
+    {
+        case 1:  return utf_codeLengthChar(c);
+        case 2:  return utf_codeLengthWchar(c);
+        default: assert(sz == 4); return 1;
+    }
+}
+
+// Encode c at s using sz-byte code units (1 = UTF-8, 2 = UTF-16, 4 = UTF-32).
+void utf_encode(int sz, void *s, dchar_t c)
+{
+    switch (sz)
+    {
+        case 1:  utf_encodeChar((unsigned char *)s, c);   break;
+        case 2:  utf_encodeWchar((unsigned short *)s, c); break;
+        default:
+            assert(sz == 4);
+            memcpy(s, &c, sz);
+    }
+}
+
diff --git a/dmd/utf.h b/dmd/utf.h
index 6c9eb06a..21974c2b 100644
--- a/dmd/utf.h
+++ b/dmd/utf.h
@@ -23,4 +23,13 @@ const char *utf_validateString(unsigned char *s, size_t len);
 
 extern int isUniAlpha(dchar_t);
 
+void utf_encodeChar(unsigned char *s, dchar_t c);    // UTF-8: writes 1 to 4 bytes at s
+void utf_encodeWchar(unsigned short *s, dchar_t c);  // UTF-16: writes 1 or 2 units at s
+
+int utf_codeLengthChar(dchar_t c);     // UTF-8 code units needed for c
+int utf_codeLengthWchar(dchar_t c);    // UTF-16 code units needed for c
+
+int utf_codeLength(int sz, dchar_t c);          // sz: 1 = utf8, 2 = utf16, 4 = utf32
+void utf_encode(int sz, void *s, dchar_t c);    // dispatches on the same sz values
+
+
 #endif