/*
 *
 * Program to convert files between ASCII and UTF8, using the
 * &#-escapes from XML to escape non-ASCII characters.
 *
 * Usage:
 *
 *   xml2asc
 * or
 *   asc2xml
 *
 * Both forms read from stdin and write to stdout. The first form
 * converts from UTF8 (with or without &#-escapes) to ASCII, inserting
 * &#-escapes for all non-ASCII characters. The second form converts
 * from ASCII (with or without &#-escapes) to UTF8, removing all
 * &#-escapes, except those representing ASCII characters.
 *
 * If invoked under any other name, the action is the same as xml2asc.
 *
 * Version: $Revision: 1.3 $ ($Date: 1998/01/19 18:58:17 $)
 * Author: Bert Bos <bert@w3.org>
 *
 * Copyright © World Wide Web Consortium, (Massachusetts Institute of
 * Technology, Institut National de Recherche en Informatique et en
 * Automatique, Keio University). All Rights Reserved.
 *
 * Please read http://www.w3.org/Consortium/Legal/copyright-software.html
 *
 **/

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* putUTF8 -- write the code point c to stdout
 *
 * Code points above 0x7F are written as a UTF8 multi-byte sequence of
 * two to six bytes (the original, pre-RFC 3629 form of UTF8 that covers
 * values up to 0x7FFFFFFF). Everything else, i.e., ASCII and values
 * that cannot be encoded, is written as a decimal "&#...;" escape.
 */
static void putUTF8(long c)
{
  int lead;					/* Marker bits of first byte */
  int shift;					/* Bit position of next group */

  if (c <= 0x7F || c > 0x7FFFFFFF) {		/* ASCII or not encodable */
    printf("&#%ld;", c);
    return;
  }

  if (c <= 0x07FF) {				/* 110xxxxx + 1 */
    lead = 0xC0;
    shift = 6;
  } else if (c <= 0xFFFF) {			/* 1110xxxx + 2 */
    lead = 0xE0;
    shift = 12;
  } else if (c <= 0x1FFFFF) {			/* 11110xxx + 3 */
    lead = 0xF0;
    shift = 18;
  } else if (c <= 0x3FFFFFF) {			/* 111110xx + 4 */
    lead = 0xF8;
    shift = 24;
  } else {					/* 1111110x + 5 */
    lead = 0xFC;
    shift = 30;
  }

  /* First byte holds the topmost bits, then one 10xxxxxx byte per 6 bits */
  putchar(lead | (c >> shift));
  while (shift > 0) {
    shift -= 6;
    putchar(0x80 | ((c >> shift) & 0x3F));
  }
}

/* refdigit -- value of c as a digit in base 10 or 16, or -1 if not a digit */
static int refdigit(int c, int base)
{
  if (isdigit(c)) return c - '0';
  if (base == 16 && isxdigit(c))
    return islower(c) ? c - 'a' + 10 : c - 'A' + 10;
  return -1;					/* Includes c == EOF */
}

/* asc2xml -- copy stdin to stdout, converting ASCII XML to UTF8 XML
 *
 * Numeric character references ("&#" + decimal digits or "&#x" +
 * hexadecimal digits) are decoded and handed to putUTF8(); all other
 * input is copied unchanged.
 *
 * - "&#" or "&#x" not followed by at least one digit is not a
 *   reference and is copied through literally.
 * - The terminating ';' is consumed; any other character that ends the
 *   digits is left in the input so that it is not lost.
 * - A '&' that does not start a reference is copied, and the character
 *   after it is examined again (it may itself be a '&').
 * - A reference whose value exceeds 0x7FFFFFFF is replaced by U+FFFD
 *   (REPLACEMENT CHARACTER) instead of overflowing.
 */
static void asc2xml(void)
{
  const long limit = 0x7FFFFFFFL;		/* Largest value accepted */
  long n;
  int c, base, digit, overflow;

  while ((c = getchar()) != EOF) {
    if (c != '&') {				/* Normal ASCII char */
      putchar(c);
      continue;
    }

    if ((c = getchar()) != '#') {		/* '&' not followed by '#' */
      putchar('&');
      if (c != EOF) ungetc(c, stdin);		/* Re-examine, may be '&' */
      continue;
    }

    base = 10;
    if ((c = getchar()) == 'x') {		/* '&#x' + hexadecimal */
      base = 16;
      c = getchar();
    }

    if ((digit = refdigit(c, base)) < 0) {	/* No digits: not a reference */
      fputs(base == 16 ? "&#x" : "&#", stdout);
      if (c != EOF) ungetc(c, stdin);
      continue;
    }

    n = 0;
    overflow = 0;
    do {
      /* Test before multiplying: signed overflow is undefined behavior */
      if (n > (limit - digit) / base) overflow = 1;
      else n = base * n + digit;
      c = getchar();
    } while ((digit = refdigit(c, base)) >= 0);

    /* Only a ';' belongs to the reference; keep anything else as input */
    if (c != ';' && c != EOF) ungetc(c, stdin);

    putUTF8(overflow ? 0xFFFDL : n);
  }
}

/* getUTF8 -- read a UTF8 encoded character from stdin
 *
 * Returns the decoded code point, or EOF at end of input. Sequences of
 * up to six bytes (values up to 0x7FFFFFFF) are accepted.
 *
 * Malformed input yields U+FFFD (REPLACEMENT CHARACTER):
 * - a byte that cannot start a sequence (10xxxxxx, 0xFE, 0xFF);
 * - a sequence cut short by EOF;
 * - a sequence interrupted by a byte that is not 10xxxxxx. That byte is
 *   pushed back, so the next call decodes it as the start of a new
 *   character.
 *
 * Overlong encodings are not rejected.
 */
static long getUTF8(void)
{
  long c;
  int b;
  int extra;					/* Continuation bytes still due */

  if ((b = getchar()) == EOF) return EOF;	/* EOF */
  if (b <= 0x7F) return b;			/* ASCII */

  if ((b & 0xE0) == 0xC0) {			/* 110xxxxx + 1 */
    c = b & 0x1F;
    extra = 1;
  } else if ((b & 0xF0) == 0xE0) {		/* 1110xxxx + 2 */
    c = b & 0x0F;
    extra = 2;
  } else if ((b & 0xF8) == 0xF0) {		/* 11110xxx + 3 */
    c = b & 0x07;
    extra = 3;
  } else if ((b & 0xFC) == 0xF8) {		/* 111110xx + 4 */
    c = b & 0x03;
    extra = 4;
  } else if ((b & 0xFE) == 0xFC) {		/* 1111110x + 5 */
    c = b & 0x01;
    extra = 5;
  } else {					/* Not a valid first byte */
    return 0xFFFDL;
  }

  /* Accumulate in a long: the result may need 31 bits */
  while (extra-- > 0) {
    b = getchar();
    if (b == EOF || (b & 0xC0) != 0x80) {	/* Truncated or malformed */
      if (b != EOF) ungetc(b, stdin);
      return 0xFFFDL;
    }
    c = (c << 6) | (b & 0x3F);
  }
  return c;
}

/* xml2asc -- copy stdin to stdout, converting UTF8 XML to ASCII XML
 *
 * Reads characters with getUTF8() until it returns EOF. Values up to
 * 127 are written as a single byte, all others as a decimal "&#...;"
 * escape.
 */
static void xml2asc(void)
{
  long code;

  for (;;) {
    code = getUTF8();
    if (code == EOF)
      break;
    if (code > 127)
      printf("&#%ld;", code);			/* Non-ASCII: escape it */
    else
      putchar(code);				/* ASCII: copy as is */
  }
}

/* usage -- print a usage message on stderr, then exit with failure status
 *
 * progname is the name to show in the message, normally argv[0]. It may
 * be a null pointer (argv[0] is null when a program is started with an
 * empty argument vector); a default name is shown in that case, because
 * passing a null pointer to "%s" is undefined behavior.
 *
 * Does not return.
 */
static void usage(char *progname)
{
  fprintf(stderr, "Usage: %s <infile >outfile\n",
	  progname != NULL ? progname : "xml2asc");
  exit(EXIT_FAILURE);
}

/* main -- select the conversion from the program name and run it
 *
 * The program takes no arguments; anything else prints the usage
 * message. If the name it was invoked under ends in "asc2xml", stdin is
 * converted from ASCII to UTF8, under any other name from UTF8 to ASCII.
 *
 * Returns 0 on success, 1 if reading stdin or writing stdout failed.
 */
int main(int argc, char *argv[])
{
  static const char name[] = "asc2xml";
  const size_t namelen = sizeof name - 1;
  size_t len;

  if (argc != 1)
    usage(argc > 0 ? argv[0] : "xml2asc");	/* argv[0] is null if argc == 0 */

  /* Compare the tail only if argv[0] is long enough to have one; stepping
   * back 7 characters from the end of a shorter string would point
   * before its start. */
  len = strlen(argv[0]);
  if (len >= namelen && strcmp(argv[0] + len - namelen, name) == 0)
    asc2xml();
  else
    xml2asc();

  /* A filter must not report success when its output was not written */
  if (ferror(stdin) || fflush(stdout) == EOF || ferror(stdout)) {
    fprintf(stderr, "%s: I/O error\n", argv[0]);
    return 1;
  }
  return 0;
}
