From b5036e1e355a95019ff3b77cc46da72db1acb184 Mon Sep 17 00:00:00 2001 From: David Nadlinger Date: Fri, 22 Apr 2011 19:06:47 +0200 Subject: [PATCH] Merged DMD commit f30daa9797cf620ba264c0446f0215eb2464ea7a: bugzilla 4389 ICE(constfold.c, expression.c), or wrong code: string~=dchar in CTFE --- dmd/constfold.c | 18 ++++--- dmd/utf.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++ dmd/utf.h | 9 ++++ 3 files changed, 146 insertions(+), 6 deletions(-) diff --git a/dmd/constfold.c b/dmd/constfold.c index 5e003435..83075573 100644 --- a/dmd/constfold.c +++ b/dmd/constfold.c @@ -25,6 +25,7 @@ #include "expression.h" #include "aggregate.h" #include "declaration.h" +#include "utf.h" #ifdef IN_GCC #include "d-gcc-real.h" @@ -1329,10 +1330,12 @@ Expression *Cat(Type *type, Expression *e1, Expression *e2) if (e1->op == TOKnull && (e2->op == TOKint64 || e2->op == TOKstructliteral)) { e = e2; + t = t1; goto L2; } else if ((e1->op == TOKint64 || e1->op == TOKstructliteral) && e2->op == TOKnull) { e = e1; + t = t2; L2: Type *tn = e->type->toBasetype(); if (tn->ty == Tchar || tn->ty == Twchar || tn->ty == Tdchar) @@ -1340,12 +1343,15 @@ Expression *Cat(Type *type, Expression *e1, Expression *e2) // Create a StringExp void *s; StringExp *es; - size_t len = 1; - int sz = tn->size(); + if (t->nextOf()) + t = t->nextOf()->toBasetype(); + int sz = t->size(); + dinteger_t v = e->toInteger(); + size_t len = utf_codeLength(sz, v); s = mem.malloc((len + 1) * sz); - memcpy((unsigned char *)s, &v, sz); + utf_encode(sz, s, v); // Add terminating 0 memset((unsigned char *)s + len * sz, 0, sz); @@ -1407,13 +1413,13 @@ Expression *Cat(Type *type, Expression *e1, Expression *e2) StringExp *es1 = (StringExp *)e1; StringExp *es; Type *t; - size_t len = es1->len + 1; int sz = es1->sz; dinteger_t v = e2->toInteger(); + size_t len = es1->len + utf_codeLength(sz, v); s = mem.malloc((len + 1) * sz); memcpy(s, es1->string, es1->len * sz); - memcpy((unsigned char *)s + es1->len * sz, &v, sz); + 
utf_encode(sz, (unsigned char *)s + (sz * es1->len), v); // Add terminating 0 memset((unsigned char *)s + len * sz, 0, sz); @@ -1463,7 +1469,7 @@ Expression *Cat(Type *type, Expression *e1, Expression *e2) if (type->toBasetype()->ty == Tsarray) { - e->type = new TypeSArray(t1->next, new IntegerExp(loc, es1->elements->dim, Type::tindex)); + e->type = new TypeSArray(t1->nextOf(), new IntegerExp(loc, es1->elements->dim, Type::tindex)); e->type = e->type->semantic(loc, NULL); } else diff --git a/dmd/utf.c b/dmd/utf.c index 6467c65b..6350056e 100644 --- a/dmd/utf.c +++ b/dmd/utf.c @@ -11,6 +11,7 @@ // http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 #include <stdio.h> +#include <string.h> #include <assert.h> #include "utf.h" @@ -21,6 +22,40 @@ int utf_isValidDchar(dchar_t c) (c > 0xDFFF && c <= 0x10FFFF && c != 0xFFFE && c != 0xFFFF); } +static const unsigned char UTF8stride[256] = +{ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF, +}; + +/** + * stride() returns the length of a UTF-8 sequence starting at index i + * in string s. + * Returns: + * The number of bytes in the UTF-8 sequence or + * 0xFF meaning s[i] is not the start of a UTF-8 sequence. 
+ */ + +unsigned stride(unsigned char* s, size_t i) +{ + unsigned result = UTF8stride[s[i]]; + return result; +} + /******************************************** * Decode a single UTF-8 character sequence. * Returns: @@ -193,3 +228,93 @@ const char *utf_decodeWchar(unsigned short *s, size_t len, size_t *pidx, dchar_t return msg; } +void utf_encodeChar(unsigned char *s, dchar_t c) +{ + if (c <= 0x7F) + { + s[0] = (char) c; + } + else if (c <= 0x7FF) + { + s[0] = (char)(0xC0 | (c >> 6)); + s[1] = (char)(0x80 | (c & 0x3F)); + } + else if (c <= 0xFFFF) + { + s[0] = (char)(0xE0 | (c >> 12)); + s[1] = (char)(0x80 | ((c >> 6) & 0x3F)); + s[2] = (char)(0x80 | (c & 0x3F)); + } + else if (c <= 0x10FFFF) + { + s[0] = (char)(0xF0 | (c >> 18)); + s[1] = (char)(0x80 | ((c >> 12) & 0x3F)); + s[2] = (char)(0x80 | ((c >> 6) & 0x3F)); + s[3] = (char)(0x80 | (c & 0x3F)); + } + else + assert(0); +} + +void utf_encodeWchar(unsigned short *s, dchar_t c) +{ + if (c <= 0xFFFF) + { + s[0] = (wchar_t) c; + } + else + { + s[0] = (wchar_t) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); + s[1] = (wchar_t) (((c - 0x10000) & 0x3FF) + 0xDC00); + } +} + + +/** + * Returns the code length of c in the encoding. + * The code is returned in character count, not in bytes. + */ + +int utf_codeLengthChar(dchar_t c) +{ + return + c <= 0x7F ? 1 + : c <= 0x7FF ? 2 + : c <= 0xFFFF ? 3 + : c <= 0x10FFFF ? 4 + : (assert(false), 6); +} + +int utf_codeLengthWchar(dchar_t c) +{ + return c <= 0xFFFF ? 1 : 2; +} + +/** + * Returns the code length of c in the encoding. + * sz is the encoding: 1 = utf8, 2 = utf16, 4 = utf32. + * The code is returned in character count, not in bytes. 
+ */ +int utf_codeLength(int sz, dchar_t c) +{ + if (sz == 1) + return utf_codeLengthChar(c); + if (sz == 2) + return utf_codeLengthWchar(c); + assert(sz == 4); + return 1; +} + +void utf_encode(int sz, void *s, dchar_t c) +{ + if (sz == 1) + utf_encodeChar((unsigned char *)s, c); + else if (sz == 2) + utf_encodeWchar((unsigned short *)s, c); + else + { + assert(sz == 4); + memcpy((unsigned char *)s, &c, sz); + } +} + diff --git a/dmd/utf.h b/dmd/utf.h index 6c9eb06a..21974c2b 100644 --- a/dmd/utf.h +++ b/dmd/utf.h @@ -23,4 +23,13 @@ const char *utf_validateString(unsigned char *s, size_t len); extern int isUniAlpha(dchar_t); +void utf_encodeChar(unsigned char *s, dchar_t c); +void utf_encodeWchar(unsigned short *s, dchar_t c); + +int utf_codeLengthChar(dchar_t c); +int utf_codeLengthWchar(dchar_t c); + +int utf_codeLength(int sz, dchar_t c); +void utf_encode(int sz, void *s, dchar_t c); + #endif