// Public-domain functions for dealing with Unicode, UTF-8, and Mac Roman strings,
// as used in SWTSG (alienryderflex.com/crawl).
//
// coded by Darel Rex Finley, 2008
// Shorthand for the unsigned char type; used below for the raw bytes of UTF-8 and Mac OS Roman
// strings.
#define BYTE unsigned char
// 2^21.  convertLongStrToUtf8 treats any long value that is negative or >= this number as
// unencodable, and writes a character value of 1 in its place.
#define UTF8_CHARS 2097152
// Table to convert the Mac OS Roman characters 0x80-0xFF to Unicode (not UTF-8).
// Derived from the table at: http://alanwood.net/demos/macroman.html
//
// Entry k holds the Unicode value for Mac OS Roman character 0x80+k, so the table is indexed with
// (character-128), as convertMacCharToUnicode does.  Each row below covers 16 consecutive
// characters: the first row is 0x80-0x8F, the second is 0x90-0x9F, and so on through 0xF0-0xFF.
long MacRomanToUnicode[128]={
196 , 197, 199, 201, 209, 214, 220, 225, 224, 226, 228, 227, 229, 231, 233, 232,
234 , 235, 237, 236, 238, 239, 241, 243, 242, 244, 246, 245, 250, 249, 251, 252,
8224 , 176, 162, 163, 167, 8226, 182, 223, 174, 169, 8482, 180, 168, 8800, 198, 216,
8734 , 177, 8804, 8805, 165, 181, 8706, 8721, 8719, 960, 8747, 170, 186, 937, 230, 248,
191 , 161, 172, 8730, 402, 8776, 8710, 171, 187, 8230, 160, 192, 195, 213, 338, 339,
8211 , 8212, 8220, 8221, 8216, 8217, 247, 9674, 255, 376, 8260, 8364, 8249, 8250, 64257, 64258,
8225 , 183, 8218, 8222, 8240, 194, 202, 193, 203, 200, 205, 206, 207, 204, 211, 212,
63743, 210, 218, 219, 217, 305, 710, 732, 175, 728, 729, 730, 184, 733, 731, 711 }
;
// Translates a single Mac OS Roman character value, in place, to its Unicode value.
//
// Only values in the range 128-255 are looked up in MacRomanToUnicode; any other value (including
// negative values and values of 256 or more) is left exactly as it was.
//
// Note: c must not be nil.
void convertMacCharToUnicode(long *c) {
  long code = *c;

  // Values outside the upper half of the Mac OS Roman range pass through untouched.
  if (code < 128 || code > 255) {
    return;
  }

  *c = MacRomanToUnicode[code - 128];
}
// Duplicates a long-null-terminated string of longs, copying no more than max non-null characters
// and always terminating the copy with a null long.
//
// Important: dst should point to an array that has room for at least max+1 longs (and at least one
// long, since the terminator is written even when max is zero or negative).
//
// Note: A nil dst makes this function a no-op.  A nil src (with a non-nil dst) produces an empty
// string-of-longs at dst.
void copyTextLong(long *dst, long *src, long max) {
  long copied = 0;

  // Without a destination there is nowhere to write anything.
  if (dst == 0) {
    return;
  }

  // Transfer characters until the limit or the source terminator is reached.  A nil source
  // transfers nothing, which leaves just the terminator below.
  if (src != 0) {
    for (; copied < max && src[copied] != 0; copied++) {
      dst[copied] = src[copied];
    }
  }

  // Terminate the copy.
  dst[copied] = 0;
}
// Copies a null-terminated UTF-8 string to a long-null-terminated string of longs (one decoded
// character per long), up to a specified number of non-null characters in length.
//
// Since UTF-8 characters are at most 21 bits (when decoded), the signedness of a plain (signed)
// long is not an issue.
//
// This function looks carefully for invalid UTF-8 structure -- if it finds any, or if the UTF-8
// string has more than 2 billion characters, then an empty string-of-longs is returned in longStr.
// The check is structural only: it does not detect characters encoded with more bytes than
// necessary.
//
// Warning: longStr must have enough space to hold max+1 longs (and at least one long, since the
// terminator is written even when max is zero or negative).
//
// Note: If longStr is nil, this function does nothing -- but if just utf is nil, or if max is zero
// or negative, this function creates an empty string-of-longs at longStr.
//
// (The utf parameter is spelled "unsigned char *", which is exactly what this file's BYTE macro
// expands to.)
void copyTextUtf8ToLongStr(long *longStr, unsigned char *utf, long max) {
  long i, j, count, extra;
  unsigned char c, d;

  // Do nothing if the destination is nil.
  if (!longStr) return;

  // Start with an empty result, so that every early return below leaves an empty string behind.
  *longStr = 0;

  // A nil source yields the empty string.
  if (!utf) return;

  // Pass 1: scan the UTF-8 string, verifying its structure and discovering its character count.
  i = 0; count = 0; c = utf[i++];
  while (c) {
    if (++count > 2000000000) return;   // (more than 2 billion characters assumed to be an error)
    if (c & 128) {
      // Work out how many continuation bytes the lead byte announces.
      if      (!(c & 64)) return;       // 10xxxxxx: continuation byte where a lead byte belongs
      else if (!(c & 32)) extra = 1;    // 110xxxxx
      else if (!(c & 16)) extra = 2;    // 1110xxxx
      else if (!(c &  8)) extra = 3;    // 11110xxx
      else return;                      // 11111xxx: never a valid lead byte
      // Every continuation byte must look like 10xxxxxx.  The terminating null fails this test
      // too, so a truncated sequence is rejected without reading past the end of the string.
      while (extra-- > 0) {
        d = utf[i++];
        if ((d & 0xC0) != 0x80) return;
      }
    }
    c = utf[i++];
  }

  // Limit the number of characters to translate.  A negative max is treated as zero; without
  // that clamp the countdown loop below would start from a negative value and never stop.
  if (max < 0) max = 0;
  if (count > max) count = max;

  // Pass 2: translate the UTF-8 string to a string of longs.  (This code plays fast-and-loose with
  // UTF-8 parsing, since the string's validity and count already have been determined above.)
  i = 0; j = 0;
  while (count-- > 0) {
    c = utf[i++];
    if (!(c & 128)) {                   // 1-byte encoding
      longStr[j++] = c;
    } else if (!(c & 32)) {             // 2-byte encoding
      longStr[j++] = ((long) (c & 31) << 6)
                   |  (long) (utf[i] & 63);
      i += 1;
    } else if (!(c & 16)) {             // 3-byte encoding
      longStr[j++] = ((long) (c & 15) << 12)
                   | ((long) (utf[i] & 63) << 6)
                   |  (long) (utf[i + 1] & 63);
      i += 2;
    } else {                            // 4-byte encoding
      longStr[j++] = ((long) (c & 7) << 18)
                   | ((long) (utf[i] & 63) << 12)
                   | ((long) (utf[i + 1] & 63) << 6)
                   |  (long) (utf[i + 2] & 63);
      i += 3;
    }
  }
  longStr[j] = 0;
}
// Builds a long-null-terminated string of Unicode longs from a null-terminated Mac OS Roman
// C-string, translating each character with convertMacCharToUnicode.
//
// At most max characters are converted, and the result is always terminated with a null long, so
// longStr must have room for at least max+1 longs (and at least one long, since the terminator is
// written even when max is zero or negative).
//
// Note: Neither longStr nor macStr may be nil.
void convertMacStrToUnicodeLongStr(long *longStr, BYTE *macStr, long max) {
  long n;

  for (n = 0; n < max && macStr[n] != 0; n++) {
    // Widen the raw byte to a long first, then translate that long in place.
    longStr[n] = macStr[n];
    convertMacCharToUnicode(longStr + n);
  }

  // Terminate the converted string.
  longStr[n] = 0;
}
// Rewrites a null-terminated string of longs as a null-terminated UTF-8 string.
//
// The conversion happens in place: the UTF-8 bytes are written over the very memory that the
// string of longs occupies, so the string of longs is destroyed.  Each long is read before any of
// its bytes are written, and yields at most four bytes, so (assuming a long occupies at least four
// bytes) the writing never catches up with the reading.
//
// Any long value outside the 21-bit range (0 to 2,097,151) is converted to a character value of 1.
// The UTF-8 string is arbitrarily terminated if it reaches about 2 billion bytes (not characters).
//
// Note: longStr must not be nil.
void convertLongStrToUtf8(long *longStr) {
  BYTE *out = (BYTE *) longStr;
  long readPos = 0, writePos = 0;
  long code;

  for (code = longStr[readPos++];
       code != 0 && writePos < 2000000000;
       code = longStr[readPos++]) {
    if (code < 0 || code >= UTF8_CHARS) {
      // Not representable: substitute a character value of 1.
      out[writePos++] = 1;
    } else if (code < 0x80) {
      // 1-byte encoding: 0xxxxxxx
      out[writePos++] = code;
    } else if (code < 0x800) {
      // 2-byte encoding: 110xxxxx 10xxxxxx
      out[writePos++] = 0xC0 |  (code >> 6);
      out[writePos++] = 0x80 |  (code        & 0x3F);
    } else if (code < 0x10000) {
      // 3-byte encoding: 1110xxxx 10xxxxxx 10xxxxxx
      out[writePos++] = 0xE0 |  (code >> 12);
      out[writePos++] = 0x80 | ((code >> 6)  & 0x3F);
      out[writePos++] = 0x80 |  (code        & 0x3F);
    } else {
      // 4-byte encoding: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      out[writePos++] = 0xF0 |  (code >> 18);
      out[writePos++] = 0x80 | ((code >> 12) & 0x3F);
      out[writePos++] = 0x80 | ((code >> 6)  & 0x3F);
      out[writePos++] = 0x80 |  (code        & 0x3F);
    }
  }

  // Terminate the UTF-8 string.
  out[writePos] = 0;
}
// Compares two long-null-terminated strings of longs.  Returns true (nonzero) if they are
// identical, otherwise false (zero).
//
// Note: If the string pointers are both nil, this function returns true, but if only one of them
// is nil, it returns false (a nil pointer does not equal an empty string).
//
// The results come straight from comparison expressions, so this function needs nothing beyond
// the bool type itself (no YES/NO macros).
bool longStrEqual(long *a, long *b) {
  long i = 0;

  // Handle nil pointers: with at least one nil, the strings match only if both are nil.
  if (!a || !b) return a == b;

  // Advance for as long as a has not ended and the two strings still agree.
  while (a[i] && a[i] == b[i]) i++;

  // The loop stopped either at the end of a or at the first difference; in both cases the
  // strings are identical only if the characters at this position are equal.
  return a[i] == b[i];
}
|