/*
	Copyright (c) 2003, WebThing Ltd
	Author: Nick Kew <nick@webthing.com>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

*/
#ifndef PARSERFACTORY
#define PARSERFACTORY

// Disabled earlier variant: each byte-order-mark kind written as a bom_t
// compound literal { BOM length in bytes, encoding name }.  The live code
// uses the static bom_t members of ParserFactory instead.
//#define UCS4321 (bom_t) { 4, "ucs-4" }
//#define UCS3412 (bom_t) { 4, "ucs-4" }
//#define UCS2143 (bom_t) { 4, "ucs-4" }
//#define UCS1234 (bom_t) { 4, "ucs-4" }
//#define UTF16L	(bom_t) { 2, "utf-16" }
//#define UTF16B	(bom_t) { 2, "utf-16" }
//#define UTF8	(bom_t) { 3, "utf-8" }
//#define NONE	(bom_t) { 0, NULL }
// "No byte-order mark".  Expands to 0 and is assigned to / compared with
// bom_t* values in ParserFactory, i.e. it serves as the null pointer there.
// NOTE(review): as an unscoped macro this name is visible to everything that
// follows this header - confirm it cannot clash with other code.
#define NONE 0

class ParserFactory {
  // Request being processed; r->pool is used for all allocations and
  // regex compilation in this class.
  request_rec* r ;
  // Receives the detected character encoding via set_encoding() and is
  // queried via encoding().
  Transcoder& trans ;
  // Sink for the <val:message> diagnostics emitted while sniffing.
  BasicWriter& out ;
  // Result of the most recent sniff_doc()/sniff_quiet(); UNSET initially.
  parsetype sniffed_type ;
  // Lower-cased copy of the content type, or 0 when none was supplied.
  char* lct ;
  // Descriptor of one byte-order mark: its length in bytes and the encoding
  // name that read_bom() passes to trans.set_encoding().
  typedef struct bom_t {
    size_t bytes ;
    char* name ;
    // Disabled comparison operators (compared descriptors by name pointer).
    /*
    const bool operator==(struct bom_t& other) const {
	return ( name == other.name) ;
    }
    const bool operator!=(struct bom_t& other) const {
	return ( name != other.name) ;
    }
    */
  } bom_t ;
  // One shared descriptor per recognised byte-order mark; the digits in the
  // UCS names give the byte order as matched in read_bom().
  static bom_t UCS4321 ;
  static bom_t UCS3412 ;
  static bom_t UCS2143 ;
  static bom_t UCS1234 ;
  static bom_t UTF16L ;
  static bom_t UTF16B ;
  static bom_t UTF8 ;
//  static bom_t NONE ;
  // BOM found by the last read_bom() call, or NONE (null) if there was none.
  bom_t* bom ;
  // Number of leading bytes consumed so far: the BOM length set by
  // read_bom(), plus the end offset of the XML declaration added by
  // has_xmldecl().  Exposed through xml_bytes().
  size_t offs ;

  //typedef enum { NONE, UTF8, UTF16L, UTF16B, UCS1234, UCS4321, UCS2143, UCS3412 } bom_type ;

  /*
   * Detect a byte-order mark at the start of the document.
   *
   *   cbuf    start of the document buffer
   *   buflen  number of valid document bytes in cbuf
   *
   * Sets the member `bom` to the matching descriptor, or to NONE when no
   * BOM is recognised.  When a BOM is found, `offs` is set to its length
   * and the transcoder is told the corresponding encoding.
   *
   * Returns the new value of `bom` (NONE, i.e. null, when nothing matched).
   *
   * Bytes at or beyond buflen are never inspected, so a document shorter
   * than the longest BOM cannot be misdetected from stale buffer contents.
   */
  bom_t* read_bom(char* cbuf, size_t buflen) {
    const unsigned char* buf = (const unsigned char*) cbuf ;
    // How much of the 4-byte window is really part of the document
    const bool has2 = ( cbuf != 0 ) && ( buflen >= 2 ) ;
    const bool has3 = has2 && ( buflen >= 3 ) ;
    const bool has4 = has2 && ( buflen >= 4 ) ;

    bom = NONE ;
    if ( has2 && (buf[0] == 0) && (buf[1] == 0) ) {
      // 00 00 xx xx : only the four-byte UCS-4 marks start like this
      if ( has4 && (buf[2] == 0xfe) && (buf[3] == 0xff) )
        bom = &UCS1234 ;
      else if ( has4 && (buf[2] == 0xff) && (buf[3] == 0xfe) )
        bom = &UCS2143 ;
    } else if ( has2 && (buf[0] == 0xfe) && (buf[1] == 0xff) ) {
      // FE FF : UCS-4 (3412) when followed by 00 00, otherwise UTF-16BE
      if ( has4 && (buf[2] == 0) && (buf[3] == 0) )
        bom = &UCS3412 ;
      else
        bom = &UTF16B ;
    } else if ( has2 && (buf[0] == 0xff) && (buf[1] == 0xfe) ) {
      // FF FE : UCS-4 (4321) when followed by 00 00, otherwise UTF-16LE
      if ( has4 && (buf[2] == 0) && (buf[3] == 0) )
        bom = &UCS4321 ;
      else
        bom = &UTF16L ;
    } else if ( has3 && (buf[0] == 0xef) && (buf[1] == 0xbb)
                && (buf[2] == 0xbf) ) {
      bom = &UTF8 ;
    }

    if ( bom ) {
      offs = bom->bytes ;
      trans.set_encoding(bom->name) ;
    }
    return bom ;
  }

  const void sniff_meta_encoding(char* buf, size_t buflen) const {
    regmatch_t match[2] ;
    regex_t* seek_meta= ap_pregcomp(r->pool,
	"(<meta[^>]*http-equiv[ \t\r\n='\"]*content-type[^>]*>)",
	REG_EXTENDED|REG_ICASE) ;
    if ( ap_regexec(seek_meta, buf, 1, match, 0) == 0 ) {
      char* meta = apr_pstrndup(r->pool, buf + match[0].rm_so,
		match[0].rm_eo - match[0].rm_so) ;
      regex_t* seek_charset=ap_pregcomp(r->pool, "charset=([A-Za-z0-9_-]+)",
		REG_EXTENDED|REG_ICASE) ;
      if ( ap_regexec(seek_charset, meta, 2, match, 0) == 0 )
	trans.set_encoding(apr_pstrndup(r->pool, meta+match[1].rm_so,
		match[1].rm_eo - match[1].rm_so)) ;
      ap_pregfree(r->pool, seek_charset) ;
    }
    ap_pregfree(r->pool, seek_meta) ;
  }

  /*
   * Look for an XML declaration ("<?xml ... version ... ?>") in the document
   * text that follows any bytes already consumed (buf + offs).
   *
   *   buf     NUL-terminated document buffer (temporarily modified, then
   *           restored, while the declaration is examined)
   *   buflen  unused
   *
   * Side effects when a declaration is matched anywhere in the text:
   *   - if it contains encoding="...", that name is passed to the transcoder
   *   - offs is advanced by the end offset of the declaration
   *
   * Returns true only when the declaration starts exactly at buf + offs.
   * In that case, if no encoding is known afterwards, UTF8.name is set.
   *
   * All match offsets are relative to buf + offs, the string handed to
   * ap_regexec, so every access below goes through `doc` rather than `buf`.
   */
  bool has_xmldecl( char* buf, size_t buflen) {
    regmatch_t match[1] ;
    bool ret = false ;

    const char* str =
        "<\\?xml[ \t\r\n]*version.*\\?>" ;
    // stricter alternative, not in use:
    // "<\?xml[ \t\r\n]*version[ \t\r\n]*=[ \t\r\n]*[\"']1.0[\"'][^\?]*\?>[ \t\r\n]*"

    char* doc = buf + offs ;    // text after any BOM already consumed

    regex_t* seek_xmldecl = ap_pregcomp(r->pool, str, 0) ;
    if ( ap_regexec(seek_xmldecl, doc, 1, match, 0) == 0 ) {
      if ( match[0].rm_so == 0 )
        ret = true ;    // ascii-compatible xmldecl at the current position

      // look for encoding: cut the string at the '?' of the closing "?>"
      // so the search cannot run past the declaration
      char* cut = doc + match[0].rm_eo - 2 ;
      const char tbr = *cut ;
      *cut = 0 ;
      char* encp = strstr(doc + match[0].rm_so, "encoding") ;
      if ( encp ) {
        encp += 8 ;             // strlen("encoding")
        // skip '=', whitespace and the opening quote
        while ( *encp && ! isalnum((unsigned char) *encp) )
          ++encp ;
        if ( *encp ) {
          // encp[-1] is the opening quote; the value ends at its twin
          char* endp = strchr(encp, encp[-1]) ;
          if ( endp )
            trans.set_encoding(apr_pstrndup(r->pool, encp, endp-encp) ) ;
        }
      }
      *cut = tbr ;              // restore the caller's buffer
      offs += match[0].rm_eo ;
    }
    ap_pregfree(r->pool, seek_xmldecl) ;
    if ( ret && ! trans.encoding() )
      trans.set_encoding(UTF8.name) ;
    return ret ;
  }
  const bool is_appendixc(char* buf, size_t buflen) const {
    regmatch_t match[1] ;
    char* fpi = 0 ;
    regex_t* seek_fpi = ap_pregcomp(r->pool, "<!doctype[^>]+>",
		REG_EXTENDED|REG_ICASE) ;
    if ( ap_regexec(seek_fpi, buf, 1, match, 0) == 0 )
      fpi = apr_pstrndup(r->pool, buf + match[0].rm_so,
		match[0].rm_eo - match[0].rm_so) ;
    ap_pregfree(r->pool, seek_fpi) ;
    if ( fpi ) {
      const char* appendixc[] = {
	"-//W3C//DTD XHTML 1.0 Strict//EN" ,
	"-//W3C//DTD XHTML 1.0 Transitional//EN" ,
	"-//W3C//DTD XHTML 1.0 Frameset//EN" ,
	NULL
      } ;
      for ( const char** decl = appendixc; *decl; ++decl )
	if ( strstr(fpi, *decl) != NULL )
	  return true ;
    }
    return false ;
  }
  /*
   * Report whether the (lower-cased) content type denotes an XML format:
   * anything containing "xml" or "vnd.wap", or one of a few exact types.
   * Returns false when no content type is known.
   */
  const bool is_xml_ctype() const {
    if ( lct == 0 )
      return false ;

    // Substring matches
    if ( strstr(lct, "xml") != 0 || strstr(lct, "vnd.wap") != 0 )
      return true ;

    // Exact matches for XML types whose name does not say so
    const char* const exact[] = {
        "application/smil" ,
        "text/vnd.iptc.newsml" ,
        "text/vnd.in3d.3dml" ,
        NULL
    } ;
    for ( const char* const* t = exact ; *t != NULL ; ++t ) {
      if ( strcmp(lct, *t) == 0 )
        return true ;
    }
    return false ;
  }
  /*
   * Report whether the (lower-cased) content type denotes an SGML format:
   * anything containing "sgml", or an entry of the exact-match list (which
   * is currently empty).  Returns false when no content type is known.
   */
  const bool is_sgml_ctype() const {
    // NULL-terminated list of exact type names; no entries at present
    const char* const exact[] = {
        NULL
    } ;

    if ( lct == 0 )
      return false ;
    if ( strstr(lct, "sgml") != 0 )
      return true ;

    for ( const char* const* t = exact ; *t != NULL ; ++t ) {
      if ( strcmp(lct, *t) == 0 )
        return true ;
    }
    return false ;
  }


public:
  /*
   * Construct a factory with no content type: lct stays null, no BOM has
   * been seen, nothing has been consumed and the sniffed type is UNSET.
   */
  ParserFactory(request_rec* rec, Transcoder& t, BasicWriter& w)
        : r(rec), trans(t), out(w) {
    sniffed_type = UNSET ;
    lct = 0 ;
    bom = NONE ;
    offs = 0 ;
  }
  /*
   * Construct a factory for a document fetched over HTTP.  The content type
   * reported by `http` is copied into the request pool and lower-cased so
   * the later comparisons against lower-case type names work.
   *
   * If no content type is available, lct stays null (apr_pstrdup returns
   * NULL for a NULL argument) instead of being dereferenced.
   */
  ParserFactory(request_rec* rec, Transcoder& t, BasicWriter& w,
        ApacheHTTP& http) : r(rec) , trans(t), out(w),
        sniffed_type(UNSET), lct(0), bom(NONE), offs(0) {
    lct = apr_pstrdup(r->pool, http.content_type()) ;
    if ( lct ) {
      // The <cctype> functions need an unsigned char value; plain char may
      // be negative for bytes >= 0x80
      for ( char* c = lct; *c; ++c)
        *c = (char) tolower((unsigned char) *c) ;
    }
  }
  /*
   * Return the fixed XML declaration (version 1.0, encoding utf-8).
   */
  const char* xmlDecl() const {
    // HSivonen bug report - need to include other clauses in this
    static const char decl[] = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" ;
    return decl ;
  }
#ifdef HAVE_UPLOAD
private:
  bool sniff_doctype(char* buf, size_t buflen) {
    regex_t* seek_doctype = ap_pregcomp(r->pool, "<!DOCTYPE", REG_ICASE) ;
    bool ret = false ;
    if ( ap_regexec(seek_doctype, buf, 0, 0, 0) == 0 )
      ret = true ;
    ap_pregfree(r->pool, seek_doctype) ;
    return ret ;
  }
  /*
   * Guess the markup type of a document from its content alone and set the
   * transcoder's encoding accordingly.  Checks, in order:
   *   1. BOM or XML declaration               -> XML
   *   2. XHTML 1.0 DOCTYPE                    -> XHTML (utf-8 unless a META
   *                                              charset was found)
   *   3. META charset found                   -> HTML
   *   4. any DOCTYPE                          -> SGML (encoding "ascii")
   *   5. otherwise                            -> UNSET, with a message
   * buf must be NUL-terminated.
   */
  parsetype sniff_quiet_(char* buf, size_t buflen) {

    // has_xmldecl() is only consulted when there is no BOM
    const bool bom_found = ( read_bom(buf, buflen) != NONE ) ;
    if ( bom_found || has_xmldecl(buf, buflen) )
      return XML ;

    sniff_meta_encoding(buf, buflen) ;
    const bool xhtml = is_appendixc(buf, buflen) ;
    if ( xhtml && ! trans.encoding() )
      trans.set_encoding("utf-8") ;
    if ( xhtml )
      return XHTML ;

    if ( trans.encoding() )
      return HTML ;

    trans.set_encoding("ascii") ;

    if ( ! sniff_doctype(buf, buflen) ) {
      out.puts("<val:message>Document doesn't look like any recognised markup type.</val:message>") ;
      return UNSET ;
    }
    return SGML ;

  }
public:
  /*
   * Guess the markup type of a document from its content alone (see
   * sniff_quiet_) and remember the result in sniffed_type.
   *
   *   buf     document buffer; assumed to have room for BUFLEN bytes
   *   buflen  number of valid bytes in buf
   *
   * The buffer is NUL-terminated for the duration of the sniff: after the
   * data when there is room (buflen < BUFLEN), otherwise by temporarily
   * overwriting the last byte, which is restored before returning.
   * An empty buffer (buflen == 0) is handled without touching buf[-1].
   */
  parsetype sniff_quiet(char* buf, size_t buflen) {

    const bool nonempty = ( buflen > 0 ) ;
    const char last = nonempty ? buf[buflen-1] : 0 ;
    if ( buflen < BUFLEN )
      buf[buflen] = 0 ;
    else
      buf[buflen-1] = 0 ;
    sniffed_type = sniff_quiet_(buf, buflen) ;
    if ( nonempty )
      buf[buflen-1] = last ;

    return sniffed_type ;
  }
#endif
  /*
   * Decide how a document should be parsed, combining its content type
   * (lct) with what the start of the document looks like, and make sure the
   * transcoder ends up with an encoding.  Diagnostics are written to `out`
   * as <val:message> elements.
   *
   *   buf     document buffer; assumed to have room for BUFLEN bytes
   *   buflen  number of valid bytes in buf
   *
   * The buffer is NUL-terminated while sniffing; if that requires
   * overwriting the last byte, it is restored before returning.
   *
   * Returns the chosen type, which is also stored in sniffed_type.
   *
   * A missing content type (lct == 0) is treated as "not recognised as
   * markup", and an empty buffer is handled without touching buf[-1].
   */
  parsetype sniff_doc(char* buf, size_t buflen) {
    const bool nonempty = ( buflen > 0 ) ;
    const char last = nonempty ? buf[buflen-1] : 0 ;
    if ( buflen < BUFLEN )
      buf[buflen] = 0 ;
    else
      buf[buflen-1] = 0 ;

    // Printable stand-in for the content type when none is known
    char no_ctype[] = "" ;
    char* shown_ctype = lct ? lct : no_ctype ;

    if ( lct && !strcmp( lct, "text/html" ) ) {
      sniffed_type = HTML ;
      if ( ! trans.encoding() )
        sniff_meta_encoding(buf, buflen) ;
      if ( ! trans.encoding() ) {
        out.puts("<val:message>No charset specified, but W3C rules for text/html require either an HTTP header or a META hack.  I'll try provisional validation with the HTTP default iso-8859-1</val:message>") ;
        trans.set_encoding("iso-8859-1") ;       // HTTP default
      }
      if ( is_appendixc(buf, buflen) ) {
        out.puts("<val:message>XHTML document served as text/html - parsing as XHTML under Appendix C rules</val:message>") ;
        sniffed_type = XHTML ;
      } else if ( ( read_bom(buf, buflen) != NONE ) || has_xmldecl(buf, buflen) ) {
        out.puts("<val:message>Start of document looks like XML but is not recognised as XHTML 1.0.  This should not be served as text/html</val:message>") ;
        sniffed_type = XML ;
      }
    } else if ( is_xml_ctype() ) {
      sniffed_type = XML ;
      if ( ( read_bom(buf, buflen) == NONE ) && !has_xmldecl(buf, buflen))
        out.puts("<val:message>Content type suggests XML, but the document doesn't look like XML</val:message>") ;
    } else if ( is_sgml_ctype() ) {
      sniffed_type = SGML ;
      if ( ( read_bom(buf, buflen) != NONE ) && has_xmldecl(buf, buflen))
        out.puts("<val:message>Content type suggests SGML, but the document looks like XML</val:message>") ;
    } else {
      if ( ( read_bom(buf, buflen) == NONE ) && !has_xmldecl(buf, buflen)) {
        out.puts("<val:message>Content type ").escape(shown_ctype)
          .puts(" is not recognised as markup, nor does the document look like it.  I won't try to validate this.</val:message>") ;
        sniffed_type = UNSET ;
      } else {
        out.puts("<val:message>Content type ").escape(shown_ctype)
          .puts(" is not recognised as markup.  However, the document looks like XML, so I'll try validating as that.</val:message>") ;
        sniffed_type = XML ;
      }
    }

    // Nothing above determined a charset: fall back by document type
    if ( ! trans.encoding() ) {
      if (sniffed_type == XML)
//	http.set_encoding("xml") ;
        trans.set_encoding("utf-8") ;
      else if (sniffed_type == XHTML) {
        out.puts("<val:message>Since the document is served as text/html, HTML rules take precedence over XML rules.  But I cannot determine charset under HTML rules.  You should fix your server to set the charset explicitly, or serve the document as an XML type.</val:message>") ;
        trans.set_encoding("utf-8") ;
      } else {
        out.puts("<val:message>No charset specified in HTTP, and neither XML nor HTML rules apply.  I'll use the HTTP default, iso-8859-1.</val:message>") ;
        trans.set_encoding("iso-8859-1") ;       // HTTP default
      }
    }
    if ( nonempty )
      buf[buflen-1] = last ;
    return sniffed_type ;
  }
  /*
   * Create the validator to use for the document, honouring (in order) the
   * per-content-type server configuration, the client's "parser" argument
   * and the type found by the preceding sniff.
   *
   *   resultsMode  passed through to the validator constructor
   *   args         argument table; when null, the "parser" argument is
   *                taken from the request's query string instead
   *
   * Returns a newly allocated validator (the caller receives the raw
   * pointer), or 0 after emitting a message when no parser applies.
   */
  ApacheValidator* selectParser(int resultsMode, apr_table_t* args) const {
    validator_conf* conf = (validator_conf*)
        ap_get_module_config(r->per_dir_config, &validator_module) ;

    // Find the configuration entry for this exact content type, if any
    parser_rec* rule = 0 ;
    if ( lct ) {
      rule = conf->plist ;
      while ( rule && strcmp(lct, rule->ctype) != 0 )
        rule = rule->next ;
    }
    parser fallback = rule ? rule->preferred : conf->defaultparser ;
    parsers permitted = rule ? rule->allowed : conf->defaultallowed ;

    // Parser explicitly requested by the client, if any
    const char* requested = args
        ? apr_table_get(args, "parser")
        : getArg(r->pool, r->args, "parser") ;
    parser choice = NULL_PARSER ; //conf->defaultparser ;
    if ( requested ) {
      if ( strcmp(requested, "OpenSP") == 0 )
        choice = OpenSP_ ;
      else if ( strcmp(requested, "Xerces") == 0 )
        choice = Xerces_ ;
    }

    // A missing or disallowed request falls back to the configured default
    if ( ! (choice & permitted) ) {
      if ( choice != NULL_PARSER ) {
        out.puts("<val:message>Parser ").puts(requested)
          .puts(" is not allowed for content type ")
          .escape(lct).puts(" by server configuration</val:message>") ;
      }
      choice = fallback ;
    }

    const bool needs_sgml_parser =
        ( sniffed_type == HTML ) || ( sniffed_type == SGML ) ;
    if ( ( choice == Xerces_ ) && needs_sgml_parser ) {
      out.puts("<val:message>Xerces cannot parse HTML or SGML documents; using OpenSP</val:message>") ;
      choice = OpenSP_ ;
    }

    // Nothing usable was sniffed: refuse whatever was selected
    if ( sniffed_type == UNSET )
      choice = NULL_PARSER ;

    if ( choice == OpenSP_ )
      return new OpenSPValidator(r, out, resultsMode, sniffed_type, args) ;
    if ( choice == Xerces_ )
      return new XercesValidator(r, out, resultsMode, args) ;

    out.puts("<val:message>Document type is not supported by this service (no parser available by configuration).</val:message>") ;
    return 0 ;
  }
  /*
   * Number of leading document bytes consumed by sniffing so far (the
   * current value of offs).
   */
  const size_t xml_bytes() const {
    return offs ;
  }
} ;

// Byte-order-mark descriptors: { length of the BOM in bytes, encoding name
// that read_bom() passes to the transcoder }.
// NOTE(review): these are ordinary (non-inline) definitions in a file that
// has an include guard, i.e. looks like a header - confirm it is included
// from exactly one translation unit, otherwise they are defined repeatedly.
// NOTE(review): the names initialise a non-const char* member from string
// literals, which newer C++ standards reject - confirm the target dialect.
// NOTE(review): three different UCS-4 byte orders share the name "ucs4" -
// confirm the transcoder can tell them apart.
ParserFactory::bom_t ParserFactory::UCS4321 = { 4, "ucs4" } ;
ParserFactory::bom_t ParserFactory::UCS3412 = { 4, "ucs4" } ;
ParserFactory::bom_t ParserFactory::UCS2143 = { 4, "ucs4" } ;
ParserFactory::bom_t ParserFactory::UCS1234 = { 4, "ucs-4be" } ;
ParserFactory::bom_t ParserFactory::UTF16L = { 2, "utf16" } ;
ParserFactory::bom_t ParserFactory::UTF16B = { 2, "utf16BE" } ;
ParserFactory::bom_t ParserFactory::UTF8 = { 3, "utf8" } ;
//ParserFactory::bom_t ParserFactory::NONE = { 0, NULL } ;

#endif
