//  Unicode, UTF-8, and Mac Roman — C Code Sample Conversion Functions



//  Public-domain functions for dealing with Unicode, UTF-8, and Mac Roman strings,
//  as used in SWTSG (alienryderflex.com/crawl).
//
//  coded by Darel Rex Finley, 2008




//  Raw 8-bit byte type, used below for the bytes of UTF-8 and Mac OS Roman strings.
#define  BYTE        unsigned char
//  2^21 -- one more than the largest 21-bit value.  When encoding to UTF-8, any long at or above
//  this limit (or below zero) is treated as out of range.
#define  UTF8_CHARS  2097152



//  Lookup table mapping the upper half of Mac OS Roman (bytes 0x80-0xFF) to Unicode code points.
//  The values are plain code points, not UTF-8 byte sequences.
//
//  Index with (macRomanByte - 128).  Each row below holds eight consecutive entries; the comment
//  at the end of the row gives the Mac OS Roman byte range it covers.
//
//  Derived from the table at:  http://alanwood.net/demos/macroman.html

long  MacRomanToUnicode[128] = {
    196,   197,   199,   201,   209,   214,   220,   225,    //  0x80-0x87
    224,   226,   228,   227,   229,   231,   233,   232,    //  0x88-0x8F
    234,   235,   237,   236,   238,   239,   241,   243,    //  0x90-0x97
    242,   244,   246,   245,   250,   249,   251,   252,    //  0x98-0x9F
   8224,   176,   162,   163,   167,  8226,   182,   223,    //  0xA0-0xA7
    174,   169,  8482,   180,   168,  8800,   198,   216,    //  0xA8-0xAF
   8734,   177,  8804,  8805,   165,   181,  8706,  8721,    //  0xB0-0xB7
   8719,   960,  8747,   170,   186,   937,   230,   248,    //  0xB8-0xBF
    191,   161,   172,  8730,   402,  8776,  8710,   171,    //  0xC0-0xC7
    187,  8230,   160,   192,   195,   213,   338,   339,    //  0xC8-0xCF
   8211,  8212,  8220,  8221,  8216,  8217,   247,  9674,    //  0xD0-0xD7
    255,   376,  8260,  8364,  8249,  8250, 64257, 64258,    //  0xD8-0xDF
   8225,   183,  8218,  8222,  8240,   194,   202,   193,    //  0xE0-0xE7
    203,   200,   205,   206,   207,   204,   211,   212,    //  0xE8-0xEF
  63743,   210,   218,   219,   217,   305,   710,   732,    //  0xF0-0xF7
    175,   728,   729,   730,   184,   733,   731,   711     //  0xF8-0xFF
};



//  Translates a single Mac OS Roman character code to its Unicode code point, in place.
//
//  Codes 128 through 255 are replaced by the matching MacRomanToUnicode entry.  Every other
//  value is left exactly as it was.

void convertMacCharToUnicode(long *c) {
  long code = *c;

  //  Only the upper half of the Mac OS Roman byte range needs translating.
  if (code < 128 || code >= 256) {
    return;
  }

  *c = MacRomanToUnicode[code - 128];
}



//  Bounded copy of a zero-terminated string of longs.
//
//  At most max non-zero elements are copied from src to dst, and a terminating 0 is always
//  written after them.
//
//  Important:  dst must point to an array with room for at least max+1 longs.
//
//       Note:  A nil dst makes the call a no-op.  A nil src (with a valid dst) produces an
//              empty string-of-longs at dst.

void copyTextLong(long *dst, long *src, long max) {
  long n = 0;

  //  Without a destination there is nothing to do.
  if (!dst) {
    return;
  }

  //  Copy elements until the limit or the source terminator is reached.  A nil source skips
  //  the copy entirely, so only the terminator below is written.
  if (src) {
    for (; n < max && src[n]; n++) {
      dst[n] = src[n];
    }
  }

  dst[n] = 0;
}



//  Decodes a null-terminated UTF-8 string into a zero-terminated string of longs, one long per
//  character, copying at most max characters.
//
//  Decoded values are at most 21 bits, so the signedness of a plain (signed) long is not an issue.
//
//  The UTF-8 string is checked for structural validity before anything is copied.  If the check
//  fails, or the string holds more than 2 billion characters, longStr receives an empty string.
//  The check covers lead/continuation byte structure only:  it does not reject characters encoded
//  with more bytes than necessary, and it applies no range checks to the decoded values.
//
//  Warning:  longStr must have enough space to hold max+1 longs.
//
//     Note:  If longStr is nil, this function does nothing.  If utf is nil, or max is zero or
//            negative, an empty string-of-longs is stored at longStr.

void copyTextUtf8ToLongStr(long *longStr, BYTE *utf, long max) {

  long  i, j, count ;
  BYTE  c, d ;

  //  Nothing can be written without a destination.
  if (!longStr) return;

  //  Start from an empty result, so that every early return below leaves a valid (empty) string.
  *longStr = 0;

  //  A nil source or a non-positive limit yields the empty string.  Without this guard a negative
  //  max would make the copy loop count from a negative value and run past the end of both buffers.
  if (!utf || max <= 0) return;

  //  Pass 1:  scan the UTF-8 string, verifying its structure and counting its characters.
  i = 0; count = 0; c = utf[i++];
  while (c) {
    if ((++count) > 2000000000) return;   //  (more than 2 billion characters assumed to be an error)
    if (c & 128) {
      //  A lead byte must be 11xxxxxx; 10xxxxxx here is a stray continuation byte.
      if (!(c & 64)) return;
      //  Every continuation byte must be 10xxxxxx.  A null terminator in mid-sequence fails the
      //  d<128 test, so the scan never reads past the end of the string.
      d = utf[i++]; if (d < 128 || (d & 64)) return;
      if (c & 32) {
        d = utf[i++]; if (d < 128 || (d & 64)) return;
        if (c & 16) {
          //  4-byte sequence; a lead byte of the form 11111xxx is never valid.
          d = utf[i++]; if (d < 128 || (d & 64) || (c & 8)) return;
        }
      }
    }
    c = utf[i++];
  }

  //  Pass 2:  decode up to max characters.  No structure checks are repeated here, since pass 1
  //  already established that every sequence is complete.
  if (count > max) count = max;
  i = 0;
  for (j = 0; j < count; j++) {
    c = utf[i++];
    if (!(c & 128)) {                                    //  1-byte encoding
      longStr[j] = c;
    } else if (!(c & 32)) {                              //  2-byte encoding
      longStr[j] = ((long) (c & 31) << 6)
                 |  (long) (utf[i] & 63);
      i += 1;
    } else if (!(c & 16)) {                              //  3-byte encoding
      longStr[j] = ((long) (c & 15) << 12)
                 | ((long) (utf[i] & 63) << 6)
                 |  (long) (utf[i + 1] & 63);
      i += 2;
    } else {                                             //  4-byte encoding
      longStr[j] = ((long) (c & 7) << 18)
                 | ((long) (utf[i] & 63) << 12)
                 | ((long) (utf[i + 1] & 63) << 6)
                 |  (long) (utf[i + 2] & 63);
      i += 3;
    }
  }
  longStr[j] = 0;
}



//  Expands a null-terminated Mac OS Roman C-string into a zero-terminated string of Unicode longs.
//
//  At most max characters are converted, each one through convertMacCharToUnicode.  A terminating
//  0 is always written, so longStr needs room for at least max+1 longs.

void convertMacStrToUnicodeLongStr(long *longStr, BYTE *macStr, long max) {
  long n;

  for (n = 0; n < max && macStr[n]; n++) {
    //  Widen the byte to a long first, then translate that long in place.
    longStr[n] = macStr[n];
    convertMacCharToUnicode(&longStr[n]);
  }

  longStr[n] = 0;
}



//  Converts a null-terminated string of longs to a null-terminated UTF-8 string.
//
//  Overwrites the same space that the string of longs occupies, destroying that string of longs.
//  Any long value outside the 21-bit range (0 to 2,097,151) will be converted to a character value of 1.
//  The UTF-8 string will be arbitrarily terminated if it reaches about 2 billion bytes (not characters).


void convertLongStrToUtf8(long *longStr) {

  long   i=0, j=0,  c=longStr[i++] ;
  BYTE  *utf=(BYTE *) longStr      ;

  while     (c && j<2000000000) {
    if      (c<    0 || c>=UTF8_CHARS) utf[j++]=1;
    else if (c<  128                 ) utf[j++]=c;    //  1-byte encoding
    else if (c< 2048                 ) {
      utf[j++]=  192+ c              /(      64);     //  2-byte encoding
      utf[j++]=  128+(c&(      64-1))           ; }
    else if (c<65536                 ) {
      utf[j++]=  224+ c              /(   64*64);     //  3-byte encoding
      utf[j++]=  128+(c&(   64*64-1))/(      64);
      utf[j++]=  128+(c&(      64-1))           ; }
    else                               {
      utf[j++]=  240+ c              /(64*64*64);     //  4-byte encoding
      utf[j++]=  128+(c&(64*64*64-1))/(   64*64);
      utf[j++]=  128+(c&(   64*64-1))/(      64);
      utf[j++]=  128+(c&(      64-1))           ; }
    c=longStr[i++]; }
  utf[j]=0; }



//  Compares two zero-terminated strings of longs.  Returns true if they are identical,
//  otherwise false.
//
//  Note:  If both pointers are nil the strings count as equal; if only one is nil they do not.
//
//  The result is always produced by a comparison expression, so this function does not depend
//  on the YES/NO macros being defined.

bool longStrEqual(long *a, long *b) {
  long i = 0;

  //  With at least one nil pointer, the strings match only when both are nil.
  if (!a || !b) {
    return a == b;
  }

  //  Advance while the elements agree and the end of a has not been reached.  (If a[i] is
  //  non-zero and equal to b[i], then b[i] is non-zero as well.)
  while (a[i] && a[i] == b[i]) {
    i++;
  }

  //  Equal only if both strings ended at the same position.
  return a[i] == b[i];
}

//  Send me an e-mail!
//
//  Does the brace style in the above code sample freak you out?  Click here to see it explained in a new window.
//
//  Back to Tutorials.