//  Unicode, UTF-8, and Mac Roman — C Code Example Conversion Functions
//
//  Unicode, UTF-8, and Mac Roman — C Code Sample Conversion Functions
//  Public-domain functions for dealing with Unicode, UTF-8, and Mac Roman strings,

//  as used in SWTSG (alienryderflex.com/crawl).

//

//  coded by Darel Rex Finley, 2008




//  BYTE is shorthand for unsigned char; the UTF-8 and Mac OS Roman string parameters in this file
//  are declared as BYTE pointers.
#define  BYTE        unsigned char

//  UTF8_CHARS is 2^21 (2,097,152).  convertLongStrToUtf8 treats any value at or above it as
//  unencodable and writes a character value of 1 in its place.
#define  UTF8_CHARS  2097152




//  Lookup table mapping the upper half of Mac OS Roman (character codes 0x80 through 0xFF) to
//  Unicode code points (plain code-point values, not UTF-8 byte sequences).  Index the table with
//  (character code - 0x80).  Each row below covers sixteen consecutive Mac OS Roman codes.
//
//  Derived from the table at:  http://alanwood.net/demos/macroman.html
long MacRomanToUnicode[128] = {
  //  0x80 - 0x8F
  0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
  0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
  //  0x90 - 0x9F
  0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
  0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
  //  0xA0 - 0xAF
  0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
  0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
  //  0xB0 - 0xBF
  0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
  0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
  //  0xC0 - 0xCF
  0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
  0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
  //  0xD0 - 0xDF
  0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
  0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
  //  0xE0 - 0xEF
  0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
  0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
  //  0xF0 - 0xFF
  0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
  0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7
};




//  Translates one Mac OS Roman character code, in place, to its Unicode code point.
//
//  Only codes 128 through 255 are remapped (by looking them up in MacRomanToUnicode); any other
//  value is left exactly as it was.
void convertMacCharToUnicode(long *c) {
  long code = *c;

  //  Values outside the table's range pass through untouched.
  if (code < 128 || code > 255) {
    return;
  }

  *c = MacRomanToUnicode[code - 128];
}




//  Copies a zero-terminated string of longs from src to dst, storing at most max non-zero
//  characters followed by a terminating zero.
//
//  Important:  dst should point to an array that has at least max+1 longs.
//
//       Note:  A nil dst makes this function a no-op.  A nil src (with a non-nil dst) produces an
//              empty string-of-longs at dst.
void copyTextLong(long *dst, long *src, long max) {
  long n = 0;

  //  Without a destination there is nothing to write.
  if (!dst) {
    return;
  }

  //  A nil source skips the copy entirely, leaving n at 0 so that only the terminator is stored.
  if (src) {
    for (; n < max && src[n] != 0; n++) {
      dst[n] = src[n];
    }
  }

  //  Terminate right after the last character copied.
  dst[n] = 0;
}




//  Returns how many continuation bytes must follow the given UTF-8 lead byte (0 to 3), or -1 if the
//  byte cannot begin a character (a stray continuation byte, or a lead byte of the form 11111xxx).
static long utf8TrailingByteCount(BYTE lead) {
  if (lead < 0x80) return  0;    //  0xxxxxxx -- 1-byte encoding
  if (lead < 0xC0) return -1;    //  10xxxxxx -- continuation byte where a lead byte belongs
  if (lead < 0xE0) return  1;    //  110xxxxx -- 2-byte encoding
  if (lead < 0xF0) return  2;    //  1110xxxx -- 3-byte encoding
  if (lead < 0xF8) return  3;    //  11110xxx -- 4-byte encoding
  return -1;                     //  11111xxx -- not a valid lead byte
}


//  Copies a null-terminated UTF-8 string to a string of long values (one decoded character per
//  long), up to a specified number of non-null characters in length.
//
//  longStr:  destination; must have enough space to hold max+1 longs.
//  utf:      null-terminated UTF-8 source string.
//  max:      maximum number of characters to store; a negative value is treated as 0.
//
//  Since UTF-8 characters are at most 21 bits (when decoded), the signedness of a plain (signed)
//  long is not an issue.
//
//  The whole source string is validated before anything is decoded -- if it has invalid UTF-8
//  structure, or holds more than 2 billion characters, an empty string-of-longs is returned in
//  longStr.  The validation does not detect characters encoded with more bytes than necessary.
//
//  Note:  If longStr is nil, this function will do nothing -- but if just utf is nil, this function
//         will create an empty string-of-longs at longStr (the same convention as copyTextLong).
void copyTextUtf8ToLongStr(long *longStr, BYTE *utf, long max) {

  long  i, j, k, trailing, count, value ;
  BYTE  c ;

  //  Do nothing if the destination is nil.
  if (!longStr) return;

  //  Start from an empty result, so that every early return below leaves a valid empty string.
  *longStr = 0;

  //  A nil source yields the empty string.
  if (!utf) return;

  //  Pass 1:  scan the UTF-8 string, verifying its structure and counting its characters.
  i = 0; count = 0;
  while ((c = utf[i++]) != 0) {
    if ((++count) > 2000000000) return;   //  (more than 2 billion characters assumed to be an error)
    trailing = utf8TrailingByteCount(c);
    if (trailing < 0) return;
    for (k = 0; k < trailing; k++) {
      //  Every continuation byte must look like 10xxxxxx.  The string's null terminator fails this
      //  test too, so a sequence cut short by the end of the string is rejected without reading
      //  past the terminator.
      if ((utf[i++] & 0xC0) != 0x80) return;
    }
  }

  //  Clamp the character count to the caller's limit.  A negative max must be clamped to zero
  //  first:  otherwise the count would go negative and the copy below would run far past the end
  //  of longStr.
  if (max < 0) max = 0;
  if (count > max) count = max;

  //  Pass 2:  decode.  No validity checks are repeated here, since pass 1 already accepted the
  //  entire string.
  i = 0;
  for (j = 0; j < count; j++) {
    c        = utf[i++];
    trailing = utf8TrailingByteCount(c);
    //  A lead byte with 1, 2, or 3 continuation bytes carries 5, 4, or 3 payload bits; shifting
    //  0x3F right by the continuation count gives exactly that mask (0x1F, 0x0F, 0x07).
    value = (trailing == 0) ? c : (c & (0x3F >> trailing));
    for (k = 0; k < trailing; k++) {
      value = (value << 6) | (utf[i++] & 0x3F);   //  each continuation byte adds 6 payload bits
    }
    longStr[j] = value;
  }
  longStr[j] = 0;
}




//  Converts a null-terminated Mac OS Roman C-string to a long-null-terminated string of Unicode
//  longs, storing at most max characters followed by a terminating zero.
//
//  Important:  longStr should point to an array that has at least max+1 longs.
//
//       Note:  If longStr is nil, this function will do nothing -- but if just macStr is nil, this
//              function will create an empty string-of-longs at longStr (the same convention as
//              copyTextLong).
void convertMacStrToUnicodeLongStr(long *longStr, BYTE *macStr, long max) {

  long  i = 0 ;

  //  Do nothing if the destination is nil.
  if (!longStr) return;

  //  Widen each source byte to a long, then remap it in place.  A nil source skips the loop,
  //  leaving i at 0 so that only the terminator is written.
  if (macStr) {
    while (i < max && macStr[i]) {
      longStr[i] = macStr[i];
      convertMacCharToUnicode(&longStr[i]);
      i++;
    }
  }

  longStr[i] = 0;
}




//  Re-encodes a zero-terminated string of longs as a null-terminated UTF-8 string, in place.
//
//  The UTF-8 bytes are written over the very memory the longs occupy, so the string of longs is
//  destroyed.  Each long is read before its bytes are written and produces at most 4 bytes, so the
//  output never overtakes the unread input (assuming a long occupies at least 4 bytes).
//
//  Any long value outside the 21-bit range (0 to 2,097,151) is converted to a character value of 1.
//  The UTF-8 string is arbitrarily terminated if it reaches about 2 billion bytes (not characters).
void convertLongStrToUtf8(long *longStr) {
  BYTE *out     = (BYTE *) longStr;
  long  readAt  = 0;
  long  written = 0;
  long  ch;

  for (ch = longStr[readAt++]; ch && written < 2000000000; ch = longStr[readAt++]) {
    if (ch < 0 || ch >= UTF8_CHARS) {
      //  Not representable:  substitute the character value 1.
      out[written++] = 1;
    } else if (ch < 0x80) {
      //  1-byte encoding
      out[written++] = ch;
    } else if (ch < 0x800) {
      //  2-byte encoding
      out[written++] = 0xC0 + (ch >> 6);
      out[written++] = 0x80 + (ch & 0x3F);
    } else if (ch < 0x10000) {
      //  3-byte encoding
      out[written++] = 0xE0 + (ch >> 12);
      out[written++] = 0x80 + ((ch >> 6) & 0x3F);
      out[written++] = 0x80 + (ch & 0x3F);
    } else {
      //  4-byte encoding
      out[written++] = 0xF0 + (ch >> 18);
      out[written++] = 0x80 + ((ch >> 12) & 0x3F);
      out[written++] = 0x80 + ((ch >> 6) & 0x3F);
      out[written++] = 0x80 + (ch & 0x3F);
    }
  }

  out[written] = 0;
}




//  Compares two zero-terminated strings of longs.  Returns true if they are identical, otherwise
//  false.
//
//  Note:  If the string pointers are both nil, this function returns true, but if only one of them
//         is nil, it returns false.
//
//  The result is produced directly from comparisons, so this function relies on nothing beyond the
//  bool type itself.
bool longStrEqual(long *a, long *b) {

  long  i = 0 ;

  //  Handle nil pointers:  once at least one pointer is nil, the pointers compare equal only when
  //  both are nil.
  if (!a || !b) return a == b;

  //  Advance for as long as the strings agree and have not ended.
  while (a[i] && a[i] == b[i]) i++;

  //  The loop stopped either at a's terminator or at the first difference; the strings are
  //  identical only if b holds the same value (the terminator) at that position.
  return a[i] == b[i];
}
//  Send me an e-mail!
//  Does the brace style in the above code sample freak you out?  Click here to see it explained in a new window.
//  Back to Tutorials.